{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 使用逻辑回归方法解决音乐推荐问题\n",
    "\n",
    "# 数据探索特征工程"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import pickle as pk\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import math\n",
    "import scipy.io as sio\n",
    "import scipy.sparse as ss\n",
    "import copy\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path = '../Data/'  # data file directory\n",
    "model_path = '../model/' # model directory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>target</th>\n",
       "      <th>city</th>\n",
       "      <th>bd</th>\n",
       "      <th>registered_via</th>\n",
       "      <th>registration_init_time</th>\n",
       "      <th>expiration_date</th>\n",
       "      <th>song_length</th>\n",
       "      <th>language</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>7.377418e+06</td>\n",
       "      <td>7.377418e+06</td>\n",
       "      <td>7.377418e+06</td>\n",
       "      <td>7.377418e+06</td>\n",
       "      <td>7.377418e+06</td>\n",
       "      <td>7.377418e+06</td>\n",
       "      <td>7.377304e+06</td>\n",
       "      <td>7.377268e+06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>5.035171e-01</td>\n",
       "      <td>7.511399e+00</td>\n",
       "      <td>1.753927e+01</td>\n",
       "      <td>6.794068e+00</td>\n",
       "      <td>2.012810e+07</td>\n",
       "      <td>2.017157e+07</td>\n",
       "      <td>2.451210e+05</td>\n",
       "      <td>1.860933e+01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>4.999877e-01</td>\n",
       "      <td>6.641625e+00</td>\n",
       "      <td>2.155447e+01</td>\n",
       "      <td>2.275774e+00</td>\n",
       "      <td>3.017281e+04</td>\n",
       "      <td>3.869831e+03</td>\n",
       "      <td>6.734471e+04</td>\n",
       "      <td>2.117681e+01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>-4.300000e+01</td>\n",
       "      <td>3.000000e+00</td>\n",
       "      <td>2.004033e+07</td>\n",
       "      <td>1.970010e+07</td>\n",
       "      <td>1.393000e+03</td>\n",
       "      <td>-1.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>4.000000e+00</td>\n",
       "      <td>2.011070e+07</td>\n",
       "      <td>2.017091e+07</td>\n",
       "      <td>2.147260e+05</td>\n",
       "      <td>3.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>5.000000e+00</td>\n",
       "      <td>2.100000e+01</td>\n",
       "      <td>7.000000e+00</td>\n",
       "      <td>2.013102e+07</td>\n",
       "      <td>2.017093e+07</td>\n",
       "      <td>2.418120e+05</td>\n",
       "      <td>3.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.300000e+01</td>\n",
       "      <td>2.900000e+01</td>\n",
       "      <td>9.000000e+00</td>\n",
       "      <td>2.015102e+07</td>\n",
       "      <td>2.017101e+07</td>\n",
       "      <td>2.721600e+05</td>\n",
       "      <td>5.200000e+01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>2.200000e+01</td>\n",
       "      <td>1.051000e+03</td>\n",
       "      <td>1.300000e+01</td>\n",
       "      <td>2.017013e+07</td>\n",
       "      <td>2.020102e+07</td>\n",
       "      <td>1.085171e+07</td>\n",
       "      <td>5.900000e+01</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             target          city            bd  registered_via  \\\n",
       "count  7.377418e+06  7.377418e+06  7.377418e+06    7.377418e+06   \n",
       "mean   5.035171e-01  7.511399e+00  1.753927e+01    6.794068e+00   \n",
       "std    4.999877e-01  6.641625e+00  2.155447e+01    2.275774e+00   \n",
       "min    0.000000e+00  1.000000e+00 -4.300000e+01    3.000000e+00   \n",
       "25%    0.000000e+00  1.000000e+00  0.000000e+00    4.000000e+00   \n",
       "50%    1.000000e+00  5.000000e+00  2.100000e+01    7.000000e+00   \n",
       "75%    1.000000e+00  1.300000e+01  2.900000e+01    9.000000e+00   \n",
       "max    1.000000e+00  2.200000e+01  1.051000e+03    1.300000e+01   \n",
       "\n",
       "       registration_init_time  expiration_date   song_length      language  \n",
       "count            7.377418e+06     7.377418e+06  7.377304e+06  7.377268e+06  \n",
       "mean             2.012810e+07     2.017157e+07  2.451210e+05  1.860933e+01  \n",
       "std              3.017281e+04     3.869831e+03  6.734471e+04  2.117681e+01  \n",
       "min              2.004033e+07     1.970010e+07  1.393000e+03 -1.000000e+00  \n",
       "25%              2.011070e+07     2.017091e+07  2.147260e+05  3.000000e+00  \n",
       "50%              2.013102e+07     2.017093e+07  2.418120e+05  3.000000e+00  \n",
       "75%              2.015102e+07     2.017101e+07  2.721600e+05  5.200000e+01  \n",
       "max              2.017013e+07     2.020102e+07  1.085171e+07  5.900000e+01  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load the pickled `data_all_train` object and summarise its numeric columns.\n",
    "# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.\n",
    "# The `with` block closes the file on exit, so no explicit close() is needed.\n",
    "with open(model_path + 'data_all_train.pkl', 'rb') as fr:\n",
    "    data_all_train = pk.load(fr)\n",
    "data_all_train.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>source_system_tab</th>\n",
       "      <th>source_screen_name</th>\n",
       "      <th>source_type</th>\n",
       "      <th>target</th>\n",
       "      <th>city</th>\n",
       "      <th>bd</th>\n",
       "      <th>gender</th>\n",
       "      <th>registered_via</th>\n",
       "      <th>registration_init_time</th>\n",
       "      <th>expiration_date</th>\n",
       "      <th>song_length</th>\n",
       "      <th>genre_ids</th>\n",
       "      <th>artist_name</th>\n",
       "      <th>composer</th>\n",
       "      <th>lyricist</th>\n",
       "      <th>language</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>201969</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7</td>\n",
       "      <td>20120102</td>\n",
       "      <td>20171005</td>\n",
       "      <td>206471.0</td>\n",
       "      <td>359</td>\n",
       "      <td>Bastille</td>\n",
       "      <td>Dan Smith| Mark Crew</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Good Grief</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1932462</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>24</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>284584.0</td>\n",
       "      <td>1259</td>\n",
       "      <td>Various Artists</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Lords of Cardboard</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>183559</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>24</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>225396.0</td>\n",
       "      <td>1259</td>\n",
       "      <td>Nas</td>\n",
       "      <td>N. Jones、W. Adams、J. Lordan、D. Ingle</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Hip Hop Is Dead(Album Version (Edited))</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149511</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>24</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>255512.0</td>\n",
       "      <td>1019</td>\n",
       "      <td>Soundway</td>\n",
       "      <td>Kwadwo Donkoh</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>Disco Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74867</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7</td>\n",
       "      <td>20120102</td>\n",
       "      <td>20171005</td>\n",
       "      <td>187802.0</td>\n",
       "      <td>1011</td>\n",
       "      <td>Brett Young</td>\n",
       "      <td>Brett Young| Kelly Archer| Justin Ebach</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Sleep Without You</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 msno  \\\n",
       "201969   FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "1932462  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "183559   Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "149511   Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "74867    FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "\n",
       "                                              song_id source_system_tab  \\\n",
       "201969   BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=           explore   \n",
       "1932462  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=        my library   \n",
       "183559   JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=        my library   \n",
       "149511   2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=        my library   \n",
       "74867    3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=           explore   \n",
       "\n",
       "          source_screen_name      source_type  target  city  bd  gender  \\\n",
       "201969               Explore  online-playlist       1     1   0     NaN   \n",
       "1932462  Local playlist more   local-playlist       1    13  24  female   \n",
       "183559   Local playlist more   local-playlist       1    13  24  female   \n",
       "149511   Local playlist more   local-playlist       1    13  24  female   \n",
       "74867                Explore  online-playlist       1     1   0     NaN   \n",
       "\n",
       "         registered_via  registration_init_time  expiration_date  song_length  \\\n",
       "201969                7                20120102         20171005     206471.0   \n",
       "1932462               9                20110525         20170911     284584.0   \n",
       "183559                9                20110525         20170911     225396.0   \n",
       "149511                9                20110525         20170911     255512.0   \n",
       "74867                 7                20120102         20171005     187802.0   \n",
       "\n",
       "        genre_ids      artist_name                                 composer  \\\n",
       "201969        359         Bastille                     Dan Smith| Mark Crew   \n",
       "1932462      1259  Various Artists                                      NaN   \n",
       "183559       1259              Nas     N. Jones、W. Adams、J. Lordan、D. Ingle   \n",
       "149511       1019         Soundway                            Kwadwo Donkoh   \n",
       "74867        1011      Brett Young  Brett Young| Kelly Archer| Justin Ebach   \n",
       "\n",
       "        lyricist  language                                     name  \n",
       "201969       NaN      52.0                               Good Grief  \n",
       "1932462      NaN      52.0                       Lords of Cardboard  \n",
       "183559       NaN      52.0  Hip Hop Is Dead(Album Version (Edited))  \n",
       "149511       NaN      -1.0                             Disco Africa  \n",
       "74867        NaN      52.0                        Sleep Without You  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the loaded frame\n",
    "data_all_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0, 0.5, 'Number of occurrences')"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZ4AAAEGCAYAAABVSfMhAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAbNUlEQVR4nO3dfbRddX3n8feHhKdRIUEurDQhTdTYMaJGuELUGUWgEOgqQQbasBxzF82aqBMU17IuwLZGniy2IDNMFSctKYljiZGKRBsa0/Dg2PKQi4RAeFi5BiTXZEgggEHGMAnf+WP/juxczj1nn2PO78R7P6+19jp7f/fv6bDC/a699+/8tiICMzOzXA7o9gDMzGx0ceIxM7OsnHjMzCwrJx4zM8vKicfMzLIa2+0B7O+OPPLImDJlSreHYWb2W+WBBx54NiJ66p1z4mliypQp9Pf3d3sYZma/VST9bLhzvtVmZmZZOfGYmVlWTjxmZpaVE4+ZmWXlxGNmZlk58ZiZWVZOPGZmlpUTj5mZZeXEY2ZmWXnlArNR7OnL39XtIdh+aPIXH+5o+77iMTOzrJx4zMwsK99qy+D4zy/t9hBsP/TAX8/t9hDMusJXPGZmlpUTj5mZZeXEY2ZmWTnxmJlZVk48ZmaWVccSj6RDJN0v6SFJGyRdluI3SXpS0rq0zUhxSbpe0oCk9ZKOK7XVJ2lj2vpK8eMlPZzqXC9JKX6EpNWp/GpJ45v1YWZmeXTyimcXcHJEvAeYAcySNDOd+3xEzEjbuhQ7A5iWtvnADVAkEWAhcCJwArCwlkhSmfmlerNS/BJgTURMA9ak42H7MDOzfDqWeKLwUjo8MG3RoMpsYGmqdy8wTtIE4HRgdUTsiIjngdUUSWwCcFhE3BMRASwFzi61tSTtLxkSr9eHmZll0tFnPJLGSFoHbKNIHvelU1elW13XSTo4xSYCm0vVB1OsUXywThzg6IjYCpA+j2rSx9Bxz5fUL6l/+/btLX1nMzNrrKOJJyL2RMQMYBJwgqRjgUuBfw+8DzgCuDgVV70m2og3UqlORCyKiN6I6O3p6WnSpJmZtSLLrLaIeAG4C5gVEVvTra5dwN9TPLeB4urjmFK1ScCWJvFJdeIAz9RuoaXPbU36MDOzTDo5q61H0ri0fyhwKvB4KSGI4tnLI6nKCmBumnk2E3gx3SZbBZwmaXyaVHAasCqd2ylpZmprLnBbqa3a7Le+IfF6fZiZWSadXCR0ArBE0hiKBLc8In4g6Q5JPRS3vdYBn0zlVwJnAgPAy8AFABGxQ9IVwNpU7vKI2JH2PwXcBBwK3J42gKuB5ZLmAU8D5zXqw8zM8ulY4omI9cB768RPHqZ8AAuGObcYWFwn3g8cWyf+HHBKK32YmVkeXrnAzMyycuIxM7OsnHjMzCwrJx4zM8vKicfMzLJy4jEzs6yceMzMLCsnHjMzy8qJx8zMsnLiMTOzrJx4zMwsKyceMzPLyonHzMyycuIxM7OsnHjMzCwrJx4zM8vKicfMzLJy4jEzs6w6lngkHSLpfkkPSdog6bIUnyrpPkkbJX1b0kEpfnA6Hkjnp5TaujTFn5B0eik+K8UGJF1Sirfch5mZ5dHJK55dwMkR8R5gBjBL0kzgK8B1ETENeB6Yl8rPA56PiLcB16VySJoOzAHeCcwCvi5pjKQxwNeAM4DpwPmpLK32YWZm+XQs8UThpXR4YNoCOBm4JcWXAGen/dnpmHT+FElK8WURsSsingQGgBPSNhARmyLiFWAZMDvVabUPMzPLpKPPeNKVyTpgG7Aa+CnwQkTsTkUGgYlpfyKwGSCdfxF4czk+pM5w8Te30YeZmWXS0cQTEXsiYgYwieIK5R31iqXPelcesQ/jjfrYi6T5kvol9W/fvr1OFTMza1fTxCPpDZIOSPtvl3SWpANb6SQiXgDuAmYC4ySNTacmAVvS/iBwTOpnLHA4sKMc
H1JnuPizbfQxdLyLIqI3Inp7enpa+apmZtZElSueHwGHSJoIrAEuAG5qVklSj6Rxaf9Q4FTgMeBO4NxUrA+4Le2vSMek83dERKT4nDQjbSowDbgfWAtMSzPYDqKYgLAi1Wm1DzMzy2Rs8yIoIl6WNA/4HxHxV5IerFBvArAkzT47AFgeET+Q9CiwTNKVwIPAjan8jcA3JQ1QXIXMAYiIDZKWA48Cu4EFEbEHQNKFwCpgDLA4Ijakti5upQ8zM8unUuKR9H7gY7w2LblpvYhYD7y3TnwTxfOeofFfAecN09ZVwFV14iuBlfuiDzMzy6PKrbbPApcCt6arj7dQ3MoyMzNrWZUrl7uBuyW9IR1vAj7T6YGZmdnIVGVW2/vTc5nH0vF7JH294yMzM7MRqcqttv8GnA48BxARDwEf6uSgzMxs5Kr0A9KI2DwktKcDYzEzs1Ggyqy2zZI+AET6vcxnSLfdzMzMWlXliueTwAKKdc4GKVaaXtDJQZmZ2chVZVbbsxS/4TEzM/uNVZnVtqS29E06Hi9pcWeHZWZmI1WVW23vTot8AhARz1NnRQIzM7MqqiSeAySNrx1IOoJqkxLMzMxep0oCuRb4N0m1N3qeR51108zMzKqoMrlgqaQHgI9QvEjtnIh4tOMjMzOzEanqLbPHgedr5SVNjoinOzYqMzMbsZomHkmfBhYCz1CsWCCK10W/u7NDMzOzkajKFc9FwO9FxHOdHoyZmY18VWa1bQZe7PRAzMxsdKhyxbMJuEvSPwG7asGI+GrHRmVmZiNWlcTzdNoOSpuZmVnbmt5qi4jLIuIy4JrafjpuSNIxku6U9JikDZIuSvEvSfq5pHVpO7NU51JJA5KekHR6KT4rxQYkXVKKT5V0n6SNkr6dVs9G0sHpeCCdn9KsDzMzy6OTbyDdDXwuIt4BzAQWSJqezl0XETPStjK1Ox2YA7wTmAV8XdIYSWOArwFnANOB80vtfCW1NY1iuve8FJ8HPB8RbwOuS+WG7aPCdzEzs32kY28gjYitEfGTtL+TInFNbFBlNrAsInZFxJPAAHBC2gYiYlNEvAIsA2ZLEnAyUFtRYQlwdqmtJWn/FuCUVH64PszMLJMsbyBNt7reC9yXQhdKWi9pcWkduIkUM+hqBlNsuPibgRciYveQ+F5tpfMvpvLDtTV0vPMl9Uvq3759eytf1czMmqg0nbr8BlJJf0oLbyCV9EbgH4HPRsQvgBuAt1K8UG4rxVpwUPwwdahoI95OW3sHIhZFRG9E9Pb09NSpYmZm7eroG0glHUiRdL4VEd8FiIhnImJPRLwK/C2v3eoaBI4pVZ8EbGkQfxYYJ2nskPhebaXzhwM7GrRlZmaZNEw86cH7xyPiYxFxdEQcFRH/ucoqBumZyo3AY+Xf/EiaUCr2UeCRtL8CmJNmpE0FpgH3A2uBaWkG20EUkwNWREQAdwLnpvp9wG2ltvrS/rnAHan8cH2YmVkmDX/HExF7JM2mmBnWqg8CHwcelrQuxb5AMSttBsUtrqeAT6S+NkhaDjxKMSNuQUTsAZB0IbAKGAMsjogNqb2LgWWSrgQepEh0pM9vShqguNKZ06wPMzPLo8oPSP9V0t8A3wZ+WQvWZqwNJyJ+TP1nKisb1LmKOu/6SVOuX1cvIjZRZ1ZaRPyK4r1BlfswM7M8qiSeD6TPy0uxoJjKbGZm1pKGiUfSAcANEbE803jMzGyEazi5IM08uzDTWMzMbBSoMp16taQ/TWuvHVHbOj4yMzMbkao84/mT9Fn+7U4Ab9n3wzEzs5GuaeKJiKk5BmJmZqND08QjaW69eEQs3ffDMTOzka7Krbb3lfYPAU4BfgI48ZiZWcuq3Gr7dPlY0uHANzs2IjMzG9EqvRZhiJcp1jgzMzNrWZVnPN/ntVcHHEDxFlD/oNTMzNpS5RnPNaX93cDPImKwQ+MxM7MRrkrieRrYmhbeRNKhkqZExFMdHZmZmY1IVZ7xfAd4tXS8J8XMzMxaViXx
jI2IV2oHaf+gzg3JzMxGsiqJZ7uks2oH6cVwz3ZuSGZmNpJVecbzSeBb6WVwAINA3dUMzMzMmqnyA9KfAjMlvRFQROzs/LDMzGykanqrTdKXJY2LiJciYqek8ZKuzDE4MzMbeao84zkjIl6oHUTE88CZzSql9/fcKekxSRskXZTiR0haLWlj+hyf4pJ0vaQBSeslHVdqqy+V3yiprxQ/XtLDqc71ktRuH2ZmlkeVxDNG0sG1A0mHAgc3KF+zG/hcRLwDmAkskDQduARYExHTgDXpGOAMiqV4pgHzgRtSf0cAC4ETgROAhbVEksrML9WbleIt9WFmZvlUSTz/C1gjaZ6kPwFWA0uaVYqIrRHxk7S/E3gMmAjMLtVfApyd9mcDS6NwLzBO0gTgdGB1ROxIV1urgVnp3GERcU9EBMVq2eW2WunDzMwyqTK54K8krQdOTaErImJVK51ImgK8F7gPODoitqa2t0o6KhWbCGwuVRtMsUbxwTpx2uhj65Dxzqe4ImLy5MmtfFUzM2uiynRqgAeBAykWC32wlQ7SbLh/BD4bEb9Ij2HqFq0TizbiDYdTpU5ELAIWAfT29jZr08zMWlBlVtsfAfcD5wJ/BNwn6dwqjUs6kCLpfCsivpvCz9Rub6XPbSk+CBxTqj4J2NIkPqlOvJ0+zMwskyrPeP4MeF9E9EXEXIoH/H/RrFKaYXYj8FhEfLV0agVQm5nWB9xWis9NM89mAi+m22WrgNPSNO7xwGnAqnRup6SZqa+5Q9pqpQ8zM8ukyq22AyJiW+n4OaolrA8CHwcelrQuxb4AXA0slzSPYuXr89K5lRTTtAcoXjZ3AUBE7JB0BbA2lbs8Inak/U8BNwGHArenjVb7MDOzfKoknn+WtAq4OR3/McUf8IYi4sfUf6YCcEqd8gEsGKatxcDiOvF+4Ng68eda7cPMzPKoMqvt85LOAf4DRSJZFBG3dnxkZmY2IlWa1ZYmBny3aUEzM7MmqjyrMTMz22eceMzMLKthE4+kNenzK/mGY2ZmI12jZzwTJH0YOEvSMobMUKutw2ZmZtaKRonnixSrOk8CvjrkXAAnd2pQZmY2cg2beCLiFuAWSX8REVdkHJOZmY1gVX7Hc4Wks4APpdBdEfGDzg7LzMxGqiqLhP4lcBHwaNouSjEzM7OWVfkB6R8AMyLiVQBJSyhejXBpJwdmZmYjU9Xf8Ywr7R/eiYGYmdnoUOWK5y+BByXdSTGl+kP4asfMzNpUZXLBzZLuAt5HkXgujoj/0+mBmZnZyFR1kdCtFC9RMzMz+414rTYzM8vKicfMzLJqmHgkHSDpkVyDMTOzka9h4km/3XlI0uRM4zEzsxGuyq22CcAGSWskrahtzSpJWixpW/mKSdKXJP1c0rq0nVk6d6mkAUlPSDq9FJ+VYgOSLinFp0q6T9JGSd+WdFCKH5yOB9L5Kc36MDOzfKrMaruszbZvAv4GWDokfl1EXFMOSJoOzAHeCfwO8C+S3p5Ofw34fWAQWCtpRUQ8CnwltbVM0jeAecAN6fP5iHibpDmp3B8P10dE7Gnz+5mZWRuaXvFExN3AU8CBaX8t0PRdPBHxI2BHxXHMBpZFxK6IeBIYAE5I20BEbIqIV4BlwGxJongtwy2p/hLg7FJbS9L+LcApqfxwfZiZWUZVFgn9LxR/wP9nCk0Evvcb9HmhpPXpVtz4UpubS2UGU2y4+JuBFyJi95D4Xm2l8y+m8sO19TqS5kvql9S/ffv29r6lmZnVVeUZzwLgg8AvACJiI3BUm/3dALwVmAFsBa5NcdUpG23E22nr9cGIRRHRGxG9PT099YqYmVmbqiSeXek2FwCSxjLMH+xmIuKZiNiTZsv9La/d6hoEjikVnQRsaRB/FhiXxlKO79VWOn84xS2/4doyM7OMqiSeuyV9AThU0u8D3wG+305nkiaUDj8K1Ga8rQDmpBlpU4FpwP0Uz5OmpRlsB1FMDlgREQHcCZyb6vcBt5Xa6kv75wJ3
pPLD9WFmZhlVmdV2CcVMsYeBTwArgb9rVknSzcBJwJGSBoGFwEmSZlBcMT2V2iMiNkhaTvGiud3AgtpsM0kXAquAMcDiiNiQurgYWCbpSor3A92Y4jcC35Q0QHGlM6dZH2Zmlk+V1alfTS9/u48iYTyRriCa1Tu/TvjGOrFa+auAq+rEV1Iku6HxTdSZlRYRvwLOa6UPMzPLp2nikfQHwDeAn1I8oJ8q6RMRcXunB2dmZiNPlVtt1wIfiYgBAElvBf4JcOIxM7OWVZlcsK2WdJJNwLYOjcfMzEa4Ya94JJ2TdjdIWgksp3jGcx7FbDMzM7OWNbrV9oel/WeAD6f97cD41xc3MzNrbtjEExEX5ByImZmNDlVmtU0FPg1MKZePiLM6NywzMxupqsxq+x7F72++D7za2eGYmdlIVyXx/Coiru/4SMzMbFSoknj+u6SFwA+BXbVgRDR9J4+ZmdlQVRLPu4CPU7x4rXarLdKxmZlZS6okno8Cbym/GsHMzKxdVVYueAgY1+mBmJnZ6FDliudo4HFJa9n7GY+nU5uZWcuqJJ6FHR+FmZmNGlXex3N3joGYmdnoUGXlgp0Us9gADgIOBH4ZEYd1cmBmZjYyVbnieVP5WNLZ1Hnzp5mZWRVVZrXtJSK+h3/DY2ZmbWqaeCSdU9rOlXQ1r916a1RvsaRtkh4pxY6QtFrSxvQ5PsUl6XpJA5LWSzquVKcvld8oqa8UP17Sw6nO9ZLUbh9mZpZPlSuePyxtpwM7gdkV6t0EzBoSuwRYExHTgDXpGOAMYFra5gM3QJFEKGbVnUhxe29hLZGkMvNL9Wa104eZmeVV5RlPW+/liYgfSZoyJDwbOCntLwHuAi5O8aUREcC9ksZJmpDKro6IHQCSVgOzJN0FHBYR96T4UuBs4PZW+4iIre18PzMza0+jV19/sUG9iIgr2ujv6Nof+ojYKumoFJ8IbC6VG0yxRvHBOvF2+nhd4pE0n+KqiMmTJ7f4Fc3MrJFGt9p+WWcDmEdxBbEvqU4s2oi308frgxGLIqI3Inp7enqaNGtmZq1o9Orra2v7kt4EXARcACwDrh2uXhPP1G5vpVtp21J8EDimVG4SsCXFTxoSvyvFJ9Up304fZmaWUcPJBWmG2JXAeookdVxEXBwR2xrVa2AFUJuZ1gfcVorPTTPPZgIvpttlq4DTJI1PkwpOA1alczslzUyz2eYOaauVPszMLKNGz3j+GjgHWAS8KyJeaqVhSTdTXK0cKWmQYnba1cBySfOAp4HzUvGVwJnAAPAyxZUVEbFD0hXA2lTu8tpEA+BTFDPnDqWYVHB7irfUh5mZ5dVoVtvnKFaj/nPgz9LPZKB4VhLNlsyJiPOHOXVKnbIBLBimncXA4jrxfuDYOvHnWu3DzMzyafSMp+VVDczMzJpxcjEzs6yceMzMLCsnHjMzy8qJx8zMsnLiMTOzrJx4zMwsKyceMzPLyonHzMyycuIxM7OsnHjMzCwrJx4zM8vKicfMzLJy4jEzs6yceMzMLCsnHjMzy8qJx8zMsnLiMTOzrJx4zMwsq64kHklPSXpY0jpJ/Sl2hKTVkjamz/EpLknXSxqQtF7ScaV2+lL5jZL6SvHjU/sDqa4a9WFmZvl084rnIxExIyJ60/ElwJqImAasSccAZwDT0jYfuAGKJAIsBE4ETgAWlhLJDalsrd6sJn2YmVkm+9OtttnAkrS/BDi7FF8ahXuBcZImAKcDqyNiR0Q8D6wGZqVzh0XEPRERwNIhbdXrw8zMMulW4gngh5IekDQ/xY6OiK0A6fOoFJ8IbC7VHUyxRvHBOvFGfexF0nxJ/ZL6t2/f3uZXNDOzesZ2qd8PRsQWSUcBqyU93qCs6sSijXhlEbEIWATQ29vbUl0zM2usK1c8EbElfW4DbqV4RvNMuk1G+tyWig8Cx5SqTwK2NIlPqhOnQR9mZpZJ9sQj6Q2S3lTbB04DHgFWALWZaX3AbWl/BTA3zW6bCbyYbpOtAk6T
ND5NKjgNWJXO7ZQ0M81mmzukrXp9mJlZJt241XY0cGua4TwW+IeI+GdJa4HlkuYBTwPnpfIrgTOBAeBl4AKAiNgh6QpgbSp3eUTsSPufAm4CDgVuTxvA1cP0YWZmmWRPPBGxCXhPnfhzwCl14gEsGKatxcDiOvF+4NiqfZiZWT7703RqMzMbBZx4zMwsKyceMzPLyonHzMyycuIxM7OsnHjMzCwrJx4zM8vKicfMzLJy4jEzs6yceMzMLCsnHjMzy8qJx8zMsnLiMTOzrJx4zMwsKyceMzPLyonHzMyycuIxM7OsnHjMzCwrJx4zM8tqVCYeSbMkPSFpQNIl3R6PmdloMuoSj6QxwNeAM4DpwPmSpnd3VGZmo8eoSzzACcBARGyKiFeAZcDsLo/JzGzUGNvtAXTBRGBz6XgQOLFcQNJ8YH46fEnSE5nGNhocCTzb7UHsD3RNX7eHYHvzv82ahdoXrfzucCdGY+Kp91809jqIWAQsyjOc0UVSf0T0dnscZkP532Y+o/FW2yBwTOl4ErClS2MxMxt1RmPiWQtMkzRV0kHAHGBFl8dkZjZqjLpbbRGxW9KFwCpgDLA4IjZ0eVijiW9h2v7K/zYzUUQ0L2VmZraPjMZbbWZm1kVOPGZmlpUTj2XhZYpsfyVpsaRtkh7p9lhGCyce6zgvU2T7uZuAWd0exGjixGM5eJki229FxI+AHd0ex2jixGM51FumaGKXxmJmXebEYzk0XabIzEYPJx7LwcsUmdmvOfFYDl6myMx+zYnHOi4idgO1ZYoeA5Z7mSLbX0i6GbgH+D1Jg5LmdXtMI52XzDEzs6x8xWNmZlk58ZiZWVZOPGZmlpUTj5mZZeXEY2ZmWTnxmHWZpHGS/muGfk6S9IFO92PWjBOPWfeNAyonHhXa+X/3JMCJx7rOv+Mx6zJJtdW6nwDuBN4NjAcOBP48Im6TNAW4PZ1/P3A2cCpwMcXyQxuBXRFxoaQe4BvA5NTFZ4GfA/cCe4DtwKcj4n/n+H5mQznxmHVZSio/iIhjJY0F/l1E/ELSkRTJYhrwu8Am4AMRca+k3wH+DTgO2AncATyUEs8/AF+PiB9Lmgysioh3SPoS8FJEXJP7O5qVje32AMxsLwK+LOlDwKsUr484Op37WUTcm/ZPAO6OiB0Akr4DvD2dOxWYLv16UfDDJL0px+DNqnDiMdu/fAzoAY6PiP8n6SngkHTul6Vy9V41UXMA8P6I+L/lYCkRmXWVJxeYdd9OoHZFcjiwLSWdj1DcYqvnfuDDksan23P/qXTuhxSLsgIgaUadfsy6xonHrMsi4jngXyU9AswAeiX1U1z9PD5MnZ8DXwbuA/4FeBR4MZ3+TGpjvaRHgU+m+PeBj0paJ+k/duwLmTXhyQVmv6UkvTEiXkpXPLcCiyPi1m6Py6wZX/GY/fb6kqR1wCPAk8D3ujwes0p8xWNmZln5isfMzLJy4jEzs6yceMzMLCsnHjMzy8qJx8zMsvr/rtmRX9SpQdYAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Target distribution: check whether the classes are balanced.\n",
    "# The column is passed through the `x` keyword; passing it positionally is\n",
    "# deprecated / keyword-only in newer seaborn releases.\n",
    "sns.countplot(x=data_all_train.target)\n",
    "plt.xlabel('target')\n",
    "plt.ylabel('Number of occurrences')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "可见 正负样本分布均匀"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 特征工程可以做哪些工作\n",
    "特征变换，这个是体力活\n",
    "1. 取对数log1p（对线性模型很重要，单调变换树模型影响不大）\n",
    "2. tf-idf\n",
    "3. 原始特征组合（加减乘除。如果是计数特征，乘法表示“and”，更有意义（FM）；或者可采用GBDT做特征编码，实现更高阶特征组合；原始特征维数太高，也可以先用基础模型得到特征的重要性，对重要的特征再组合）\n",
    "4. t-SNE及PCA降维后的特征 （降维部分讲解）\n",
    "5. 统计特征，如sum of the row, number of non-zero, max of the row，x-mean，个人感觉对这个数据集意义不大"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "29\n"
     ]
    }
   ],
   "source": [
    "# Mean age, rounded up, computed only over ages strictly between 9 and 100\n",
    "plausible_age = (data_all_train.bd > 9) & (data_all_train.bd < 100)\n",
    "bd_mean = math.ceil(data_all_train[plausible_age].bd.mean())\n",
    "print(bd_mean)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace missing ages (bd) with 0\n",
    "data_all_train['bd'] = data_all_train['bd'].fillna(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Discretise the age column (bd) into ordinal sections\n",
    "def get_bd_section(bd, bd_mean_section):\n",
    "    '''Map a raw age value to an age-section code.\n",
    "\n",
    "    Args:\n",
    "        bd: raw age value.\n",
    "        bd_mean_section: section code returned for out-of-range ages\n",
    "            (bd < 9 or bd > 100).\n",
    "\n",
    "    Returns:\n",
    "        bd_mean_section for out-of-range ages; otherwise 1 (bd < 18),\n",
    "        2 (< 26), 3 (< 36), 4 (< 46), 5 (< 56), 6 (< 66) or 7 (66 and above).\n",
    "    '''\n",
    "    # Must be `or`: no value is both below 9 and above 100, so an `and`\n",
    "    # test would never route outliers to the mean section.\n",
    "    if bd < 9 or bd > 100:\n",
    "        return bd_mean_section\n",
    "    # Exclusive upper bounds of sections 1..6; anything above falls into 7.\n",
    "    for section, upper_bound in enumerate((18, 26, 36, 46, 56, 66), start=1):\n",
    "        if bd < upper_bound:\n",
    "            return section\n",
    "    return 7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>source_system_tab</th>\n",
       "      <th>source_screen_name</th>\n",
       "      <th>source_type</th>\n",
       "      <th>target</th>\n",
       "      <th>city</th>\n",
       "      <th>bd</th>\n",
       "      <th>gender</th>\n",
       "      <th>registered_via</th>\n",
       "      <th>registration_init_time</th>\n",
       "      <th>expiration_date</th>\n",
       "      <th>song_length</th>\n",
       "      <th>genre_ids</th>\n",
       "      <th>artist_name</th>\n",
       "      <th>composer</th>\n",
       "      <th>lyricist</th>\n",
       "      <th>language</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>201969</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7</td>\n",
       "      <td>20120102</td>\n",
       "      <td>20171005</td>\n",
       "      <td>206471.0</td>\n",
       "      <td>359</td>\n",
       "      <td>Bastille</td>\n",
       "      <td>Dan Smith| Mark Crew</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Good Grief</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1932462</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>24</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>284584.0</td>\n",
       "      <td>1259</td>\n",
       "      <td>Various Artists</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Lords of Cardboard</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>183559</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>24</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>225396.0</td>\n",
       "      <td>1259</td>\n",
       "      <td>Nas</td>\n",
       "      <td>N. Jones、W. Adams、J. Lordan、D. Ingle</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Hip Hop Is Dead(Album Version (Edited))</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149511</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>24</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>255512.0</td>\n",
       "      <td>1019</td>\n",
       "      <td>Soundway</td>\n",
       "      <td>Kwadwo Donkoh</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>Disco Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74867</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7</td>\n",
       "      <td>20120102</td>\n",
       "      <td>20171005</td>\n",
       "      <td>187802.0</td>\n",
       "      <td>1011</td>\n",
       "      <td>Brett Young</td>\n",
       "      <td>Brett Young| Kelly Archer| Justin Ebach</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Sleep Without You</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 msno  \\\n",
       "201969   FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "1932462  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "183559   Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "149511   Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "74867    FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "\n",
       "                                              song_id source_system_tab  \\\n",
       "201969   BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=           explore   \n",
       "1932462  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=        my library   \n",
       "183559   JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=        my library   \n",
       "149511   2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=        my library   \n",
       "74867    3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=           explore   \n",
       "\n",
       "          source_screen_name      source_type  target  city  bd  gender  \\\n",
       "201969               Explore  online-playlist       1     1   0     NaN   \n",
       "1932462  Local playlist more   local-playlist       1    13  24  female   \n",
       "183559   Local playlist more   local-playlist       1    13  24  female   \n",
       "149511   Local playlist more   local-playlist       1    13  24  female   \n",
       "74867                Explore  online-playlist       1     1   0     NaN   \n",
       "\n",
       "         registered_via  registration_init_time  expiration_date  song_length  \\\n",
       "201969                7                20120102         20171005     206471.0   \n",
       "1932462               9                20110525         20170911     284584.0   \n",
       "183559                9                20110525         20170911     225396.0   \n",
       "149511                9                20110525         20170911     255512.0   \n",
       "74867                 7                20120102         20171005     187802.0   \n",
       "\n",
       "        genre_ids      artist_name                                 composer  \\\n",
       "201969        359         Bastille                     Dan Smith| Mark Crew   \n",
       "1932462      1259  Various Artists                                      NaN   \n",
       "183559       1259              Nas     N. Jones、W. Adams、J. Lordan、D. Ingle   \n",
       "149511       1019         Soundway                            Kwadwo Donkoh   \n",
       "74867        1011      Brett Young  Brett Young| Kelly Archer| Justin Ebach   \n",
       "\n",
       "        lyricist  language                                     name  \n",
       "201969       NaN      52.0                               Good Grief  \n",
       "1932462      NaN      52.0                       Lords of Cardboard  \n",
       "183559       NaN      52.0  Hip Hop Is Dead(Album Version (Edited))  \n",
       "149511       NaN      -1.0                             Disco Africa  \n",
       "74867        NaN      52.0                        Sleep Without You  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(type(data_all_train))\n",
    "data_all_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>source_system_tab</th>\n",
       "      <th>source_screen_name</th>\n",
       "      <th>source_type</th>\n",
       "      <th>target</th>\n",
       "      <th>city</th>\n",
       "      <th>bd</th>\n",
       "      <th>gender</th>\n",
       "      <th>registered_via</th>\n",
       "      <th>registration_init_time</th>\n",
       "      <th>expiration_date</th>\n",
       "      <th>song_length</th>\n",
       "      <th>genre_ids</th>\n",
       "      <th>artist_name</th>\n",
       "      <th>composer</th>\n",
       "      <th>lyricist</th>\n",
       "      <th>language</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>201969</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7</td>\n",
       "      <td>20120102</td>\n",
       "      <td>20171005</td>\n",
       "      <td>206471.0</td>\n",
       "      <td>359</td>\n",
       "      <td>Bastille</td>\n",
       "      <td>Dan Smith| Mark Crew</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Good Grief</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1932462</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>284584.0</td>\n",
       "      <td>1259</td>\n",
       "      <td>Various Artists</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Lords of Cardboard</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>183559</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>225396.0</td>\n",
       "      <td>1259</td>\n",
       "      <td>Nas</td>\n",
       "      <td>N. Jones、W. Adams、J. Lordan、D. Ingle</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Hip Hop Is Dead(Album Version (Edited))</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149511</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>255512.0</td>\n",
       "      <td>1019</td>\n",
       "      <td>Soundway</td>\n",
       "      <td>Kwadwo Donkoh</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>Disco Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74867</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7</td>\n",
       "      <td>20120102</td>\n",
       "      <td>20171005</td>\n",
       "      <td>187802.0</td>\n",
       "      <td>1011</td>\n",
       "      <td>Brett Young</td>\n",
       "      <td>Brett Young| Kelly Archer| Justin Ebach</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Sleep Without You</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 msno  \\\n",
       "201969   FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "1932462  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "183559   Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "149511   Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "74867    FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "\n",
       "                                              song_id source_system_tab  \\\n",
       "201969   BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=           explore   \n",
       "1932462  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=        my library   \n",
       "183559   JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=        my library   \n",
       "149511   2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=        my library   \n",
       "74867    3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=           explore   \n",
       "\n",
       "          source_screen_name      source_type  target  city  bd  gender  \\\n",
       "201969               Explore  online-playlist       1     1   1     NaN   \n",
       "1932462  Local playlist more   local-playlist       1    13   2  female   \n",
       "183559   Local playlist more   local-playlist       1    13   2  female   \n",
       "149511   Local playlist more   local-playlist       1    13   2  female   \n",
       "74867                Explore  online-playlist       1     1   1     NaN   \n",
       "\n",
       "         registered_via  registration_init_time  expiration_date  song_length  \\\n",
       "201969                7                20120102         20171005     206471.0   \n",
       "1932462               9                20110525         20170911     284584.0   \n",
       "183559                9                20110525         20170911     225396.0   \n",
       "149511                9                20110525         20170911     255512.0   \n",
       "74867                 7                20120102         20171005     187802.0   \n",
       "\n",
       "        genre_ids      artist_name                                 composer  \\\n",
       "201969        359         Bastille                     Dan Smith| Mark Crew   \n",
       "1932462      1259  Various Artists                                      NaN   \n",
       "183559       1259              Nas     N. Jones、W. Adams、J. Lordan、D. Ingle   \n",
       "149511       1019         Soundway                            Kwadwo Donkoh   \n",
       "74867        1011      Brett Young  Brett Young| Kelly Archer| Justin Ebach   \n",
       "\n",
       "        lyricist  language                                     name  \n",
       "201969       NaN      52.0                               Good Grief  \n",
       "1932462      NaN      52.0                       Lords of Cardboard  \n",
       "183559       NaN      52.0  Hip Hop Is Dead(Album Version (Edited))  \n",
       "149511       NaN      -1.0                             Disco Africa  \n",
       "74867        NaN      52.0                        Sleep Without You  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Original note: set the outliers of the bd feature to the median.\n",
    "# bd_mean_section is forwarded to get_bd_section as its second argument; presumably it is\n",
    "# the section assigned to outlier values -- TODO confirm against get_bd_section.\n",
    "bd_mean_section = 3\n",
    "data_all_train['bd'] = data_all_train['bd'].apply(get_bd_section, args=(bd_mean_section,))\n",
    "data_all_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7f915f236128>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAD4CAYAAADFAawfAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAAJtklEQVR4nO3dXYim91nH8d/VbIpJjSmaQcW6GV8gFKptyiCUYNFUS2pqD8SDBhQVYU5EWhTseiQ9iye+HIiw1pdAakVjc5JgqaJBIhqZTdOQdiNiTWh8ywSxSbXYFy8PdpJsNrOdZ7pzz1w78/nAwz4z93/vuY6+e3M//3unujsAzPWaox4AgK9OqAGGE2qA4YQaYDihBhju1BInvemmm3p9fX2JUwMcS+fOnXuuu9d2O7ZIqNfX17O1tbXEqQGOpap6+nLH3PoAGE6oAYYTaoDhhBpgOKEGGG7PUFfVLVX12EWv56vq/YcxHByUqnrVC64We27P6+5/SPKWJKmqa5L8S5L7F54LDszlolxV8b9HcjXY7z7qdyT5p+6+7H4/mOriKLui5mqy33vU703ykd0OVNVmVW1V1db29vaVTwZAkn2Euqpem+Q9Sf5kt+Pdfba7N7p7Y21t16cgAfga7OfWx7uSPNrd/7HUMLAktzu4Wu3n1sdducxtD5jsch8Y+iCRq8VKoa6q65P8cJKPLjsOLKO7X/WCq8VKtz66+3+SfNPCswCwC08mAgwn1ADDCTXAcEINMJxQAwwn1ADDCTXAcEINMJxQAwwn1ADDCTXAcEINMJxQAwwn1ADDCTXAcEINMJxQAwwn1ADDCTXAcEINMJxQAwwn1ADDCTXAcEINMNxKoa6q11fVfVX1ZFWdr6q3LT0YABecWnHdbyb5WHf/eFW9Nsn1C84EwEX2DHVVfUOStyf56STp7i8m+eKyYwHwolVufXxnku0kv19Vn6iqD1XV6y5dVFWbVbVVVVvb29sHPihcqqoO7QVHaZVQn0ry1iS/3d23JvnvJGcuXdTdZ7t7o7s31tbWDnhMeLXu3vfr5g888DX9PThKq4T6mSTPdPcjO1/flwvhBuAQ7Bnq7v73JJ+tqlt2vvWOJJ9edCoAXrLqro+fT/LhnR0fn0nyM8uNBMDFVgp1dz+WZGPhWQDYhScTAYYTaoDhhBpgOKEGGE6oAYYTaoDhhBpgOKEGGE6oAYYTaoDhhBpgOKEGGE6oAYYTaoDhhBpgOKEGGE6oAYYTaoDhhBpgOKEGGE6oAYYTaoDhhBpgOKEGGO7UKouq6qkkLyT5SpIvd/fGkkMB8LKVQr3jB7v7ucUmAWBXbn0ADLdqqDvJx6vqXFVt7ragqjaraquqtra3tw9uQoATbtVQ39bdb03yriQ/V1Vvv3RBd5/t7o3u3lhbWzvQIQFOspVC3d3/uvPns0nuT/J9Sw4FwMv2DHVVva6qbnjxfZJ3Jnli6cEAuGCVXR/fnOT+qnpx/R9298cWnQqAl+wZ6u7+TJI3H8IsAOzC9jyA4YQaYDihBhhOqAGGE2qA4YQaYDihBhhOqAGGE2qA4YQaYDihBhhOqAGGE2qA4YQaYDihBhhOqAGGE2qA4YQaYDihBhhOqAGGE2qA4YQaYDihBhhOqAGGWznUVXVNVX2iqh5YciAAXmk/V9TvS3J+qUEA2N1Koa6qNyS5M8mHlh0HgEutekX9G0l+Kcn/XW5BVW1W1VZVbW1vbx/IcACsEOqqeneSZ7v73Fdb191nu3ujuzfW1tYObECAk26VK+rbkrynqp5K8kdJbq+qexedCoCX7Bnq7v7l7n5Dd68neW+Sv+zun1h8MgCS2EcNMN6p/Szu7oeSPLTIJADsyhU1wHBCDTCcUAMMJ9QAwwk1wHBCDTCcUAMMJ9QAwwk1wHBCDTCcUAMMJ9QAwwk1wHBCDTCcUAMMJ9QAwwk1wHBCDTCcUAMMJ9QAwwk1wHBC
DTCcUAMMJ9QAw+0Z6qr6uqr6+6r6ZFV9qqo+eBiDAXDBqRXW/G+S27v781V1bZKHq+rPuvvvFp4NgKwQ6u7uJJ/f+fLanVcvORQAL1vlijpVdU2Sc0m+O8lvdfcju6zZTLKZJKdPnz7IGTkh3vzBj+dzX/jS4j9n/cyDi57/xuuuzSd/5Z2L/gxOlpVC3d1fSfKWqnp9kvur6k3d/cQla84mOZskGxsbrrjZt8994Ut56u47j3qMK7b0PwScPPva9dHd/5XkoSR3LDINAK+yyq6PtZ0r6VTVdUl+KMmTSw8GwAWr3Pr41iT37Nynfk2SP+7uB5YdC4AXrbLr4/Ektx7CLADswpOJAMMJNcBwQg0wnFADDCfUAMMJNcBwQg0wnFADDCfUAMMJNcBwQg0wnFADDCfUAMMJNcBwQg0wnFADDCfUAMMJNcBwQg0wnFADDCfUAMMJNcBwQg0wnFADDLdnqKvq26vqr6rqfFV9qqredxiDAXDBqRXWfDnJL3b3o1V1Q5JzVfXn3f3phWcDICtcUXf3v3X3ozvvX0hyPsm3LT0YABesckX9kqpaT3Jrkkd2ObaZZDNJTp8+fQCjcdLc8MYz+Z57zhz1GFfshjcmyZ1HPQbHyMqhrqqvT/KnSd7f3c9fery7zyY5myQbGxt9YBNyYrxw/u48dffVH7j1Mw8e9QgcMyvt+qiqa3Mh0h/u7o8uOxIAF1tl10cl+d0k57v715YfCYCLrXJFfVuSn0xye1U9tvP6kYXnAmDHnveou/vhJHUIswCwC08mAgwn1ADDCTXAcEINMJxQAwwn1ADDCTXAcEINMJxQAwwn1ADDCTXAcEINMJxQAwwn1ADDCTXAcEINMJxQAwwn1ADDCTXAcEINMJxQAwwn1ADDCTXAcEINMNyeoa6q36uqZ6vqicMYCIBXWuWK+g+S3LHwHABcxp6h7u6/TvKfhzALALs4dVAnqqrNJJtJcvr06YM6LSfM+pkHV1779K++e8FJXunmDzyw8tobr7t2wUk4iaq7915UtZ7kge5+0yon3djY6K2trSubDOAEqapz3b2x2zG7PgCGE2qA4VbZnveRJH+b5Jaqeqaqfnb5sQB40Z4fJnb3XYcxCAC7c+sDYDihBhhOqAGGE2qA4VZ64GXfJ63aTvL0gZ8YrtxNSZ476iFgFzd399puBxYJNUxVVVuXe/oLpnLrA2A4oQYYTqg5ac4e9QCwX+5RAwznihpgOKEGGE6oObaqan2vX8pcVT9QVav/+hY4AkINMJxQc9ydqqp7qurxqrqvqq6vqjuq6smqejjJjx31gLAXoea4uyXJ2e7+3iTPJ/mFJL+T5EeTfH+SbznC2WAlQs1x99nu/pud9/cm2Ujyz939j31hb+q9RzcarEaoOe4ufVDgxl2+B6MJNcfd6ap62877u5L8RZLvqKrvuuh7MJpQc9ydT/JTVfV4km9M8utJNpM8uPNhov+Ol/E8Qg4wnCtqgOGEGmA4oQYYTqgBhhNqgOGEGmA4oQYY7v8BKaAlEVqg06MAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Box plot of the bd column.\n",
    "data_all_train['bd'].plot(kind='box')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Discretize song length (unit: ms)\n",
    "def get_song_length_section(song_length):\n",
    "    \"\"\"Map a song length in milliseconds to a section code from 1 to 8.\n",
    "\n",
    "    Sections 1-7 cover lengths strictly below 5, 10, 20, 30, 40, 50 and\n",
    "    60 minutes respectively. Every other value maps to 8: lengths of\n",
    "    60 minutes or more, and values such as NaN for which each `<`\n",
    "    comparison is false.\n",
    "    \"\"\"\n",
    "    ms_per_minute = 1000*60\n",
    "    upper_bounds_minutes = (5, 10, 20, 30, 40, 50, 60)\n",
    "    for section, minutes in enumerate(upper_bounds_minutes, start=1):\n",
    "        # Bounds are exclusive: a length exactly on a bound lands in the next section.\n",
    "        if song_length < ms_per_minute*minutes:\n",
    "            return section\n",
    "    return 8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>source_system_tab</th>\n",
       "      <th>source_screen_name</th>\n",
       "      <th>source_type</th>\n",
       "      <th>target</th>\n",
       "      <th>city</th>\n",
       "      <th>bd</th>\n",
       "      <th>gender</th>\n",
       "      <th>registered_via</th>\n",
       "      <th>registration_init_time</th>\n",
       "      <th>expiration_date</th>\n",
       "      <th>song_length</th>\n",
       "      <th>genre_ids</th>\n",
       "      <th>artist_name</th>\n",
       "      <th>composer</th>\n",
       "      <th>lyricist</th>\n",
       "      <th>language</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>201969</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7</td>\n",
       "      <td>20120102</td>\n",
       "      <td>20171005</td>\n",
       "      <td>1</td>\n",
       "      <td>359</td>\n",
       "      <td>Bastille</td>\n",
       "      <td>Dan Smith| Mark Crew</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Good Grief</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1932462</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>1</td>\n",
       "      <td>1259</td>\n",
       "      <td>Various Artists</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Lords of Cardboard</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>183559</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>1</td>\n",
       "      <td>1259</td>\n",
       "      <td>Nas</td>\n",
       "      <td>N. Jones、W. Adams、J. Lordan、D. Ingle</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Hip Hop Is Dead(Album Version (Edited))</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149511</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>1</td>\n",
       "      <td>1019</td>\n",
       "      <td>Soundway</td>\n",
       "      <td>Kwadwo Donkoh</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>Disco Africa</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74867</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>7</td>\n",
       "      <td>20120102</td>\n",
       "      <td>20171005</td>\n",
       "      <td>1</td>\n",
       "      <td>1011</td>\n",
       "      <td>Brett Young</td>\n",
       "      <td>Brett Young| Kelly Archer| Justin Ebach</td>\n",
       "      <td>NaN</td>\n",
       "      <td>52.0</td>\n",
       "      <td>Sleep Without You</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 msno  \\\n",
       "201969   FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "1932462  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "183559   Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "149511   Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "74867    FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "\n",
       "                                              song_id source_system_tab  \\\n",
       "201969   BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=           explore   \n",
       "1932462  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=        my library   \n",
       "183559   JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=        my library   \n",
       "149511   2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=        my library   \n",
       "74867    3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=           explore   \n",
       "\n",
       "          source_screen_name      source_type  target  city  bd  gender  \\\n",
       "201969               Explore  online-playlist       1     1   1     NaN   \n",
       "1932462  Local playlist more   local-playlist       1    13   2  female   \n",
       "183559   Local playlist more   local-playlist       1    13   2  female   \n",
       "149511   Local playlist more   local-playlist       1    13   2  female   \n",
       "74867                Explore  online-playlist       1     1   1     NaN   \n",
       "\n",
       "         registered_via  registration_init_time  expiration_date  song_length  \\\n",
       "201969                7                20120102         20171005            1   \n",
       "1932462               9                20110525         20170911            1   \n",
       "183559                9                20110525         20170911            1   \n",
       "149511                9                20110525         20170911            1   \n",
       "74867                 7                20120102         20171005            1   \n",
       "\n",
       "        genre_ids      artist_name                                 composer  \\\n",
       "201969        359         Bastille                     Dan Smith| Mark Crew   \n",
       "1932462      1259  Various Artists                                      NaN   \n",
       "183559       1259              Nas     N. Jones、W. Adams、J. Lordan、D. Ingle   \n",
       "149511       1019         Soundway                            Kwadwo Donkoh   \n",
       "74867        1011      Brett Young  Brett Young| Kelly Archer| Justin Ebach   \n",
       "\n",
       "        lyricist  language                                     name  \n",
       "201969       NaN      52.0                               Good Grief  \n",
       "1932462      NaN      52.0                       Lords of Cardboard  \n",
       "183559       NaN      52.0  Hip Hop Is Dead(Album Version (Edited))  \n",
       "149511       NaN      -1.0                             Disco Africa  \n",
       "74867        NaN      52.0                        Sleep Without You  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Replace song_length (ms) by its section code; missing lengths are filled with 0 first,\n",
    "# so they end up in section 1.\n",
    "# Series.apply is used because only this one column is needed; DataFrame.apply(axis=1)\n",
    "# would build a row object for every record.\n",
    "# NOTE: overwrites the column in place, so re-running this cell re-bins the section codes.\n",
    "data_all_train['song_length'] = data_all_train['song_length'].fillna(0).apply(get_song_length_section)\n",
    "data_all_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    6693269\n",
       "2     670395\n",
       "3      11094\n",
       "4       1589\n",
       "5        400\n",
       "8        349\n",
       "7        210\n",
       "6        112\n",
       "Name: song_length, dtype: int64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of rows per song-length section.\n",
    "data_all_train['song_length'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "572"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of distinct genre_ids strings (missing values are not counted).\n",
    "data_all_train['genre_ids'].nunique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "465                   3717690\n",
       "458                   1261208\n",
       "921                    350311\n",
       "1609                   315089\n",
       "444                    243600\n",
       "1259                   181713\n",
       "2022                   152545\n",
       "359                    109513\n",
       "2122                    71859\n",
       "139                     65827\n",
       "451                     63818\n",
       "437                     58530\n",
       "958                     56125\n",
       "786                     43309\n",
       "1616|1609               42306\n",
       "465|1259                35352\n",
       "1011                    33496\n",
       "444|1259                32610\n",
       "921|465                 29746\n",
       "139|125|109             28031\n",
       "2157                    26430\n",
       "726                     21807\n",
       "921|458                 19760\n",
       "465|458                 16925\n",
       "947                     15516\n",
       "691                     15469\n",
       "786|947                 13656\n",
       "1616                    11473\n",
       "465|2022                10293\n",
       "1152                    10055\n",
       "                       ...   \n",
       "1096|958                    1\n",
       "1969|444|2100               1\n",
       "1011|359                    1\n",
       "1609|2122|786               1\n",
       "751                         1\n",
       "1969|275|2100|1572          1\n",
       "465|2022|359                1\n",
       "1152|2122|786|947           1\n",
       "109|94                      1\n",
       "1616|2109                   1\n",
       "1944|310                    1\n",
       "458|2130                    1\n",
       "465|388|958                 1\n",
       "921|451                     1\n",
       "1152|465|958                1\n",
       "1180|437                    1\n",
       "430|1011                    1\n",
       "516|465                     1\n",
       "444|359                     1\n",
       "465|2213|2215               1\n",
       "880|465                     1\n",
       "465|2130|139                1\n",
       "388|1011                    1\n",
       "2015                        1\n",
       "921|1633                    1\n",
       "338                         1\n",
       "940|726                     1\n",
       "829|822                     1\n",
       "388|509                     1\n",
       "437|2022                    1\n",
       "Name: genre_ids, Length: 572, dtype: int64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Frequency of each raw genre_ids string; a string may hold several ids joined by '|'.\n",
    "data_all_train['genre_ids'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 集合列表之间相互转换\n",
    "#list(set('880|465'.split('|')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def add_ids_2_set(genre_ids, id_set):\n",
    "    genre_ids = str(genre_ids)\n",
    "    id_arr = genre_ids.split('|')\n",
    "    for item in id_arr:\n",
    "        id_set.add(item)\n",
    "    return"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "201969     None\n",
       "1932462    None\n",
       "183559     None\n",
       "149511     None\n",
       "74867      None\n",
       "1674672    None\n",
       "1240481    None\n",
       "157817     None\n",
       "183201     None\n",
       "1552643    None\n",
       "61001      None\n",
       "527775     None\n",
       "12046      None\n",
       "239362     None\n",
       "1781569    None\n",
       "90840      None\n",
       "1563342    None\n",
       "868444     None\n",
       "314283     None\n",
       "279520     None\n",
       "403930     None\n",
       "76149      None\n",
       "131015     None\n",
       "1175447    None\n",
       "99623      None\n",
       "569498     None\n",
       "1020347    None\n",
       "371818     None\n",
       "953615     None\n",
       "527790     None\n",
       "           ... \n",
       "370560     None\n",
       "120278     None\n",
       "1781568    None\n",
       "341069     None\n",
       "341300     None\n",
       "200962     None\n",
       "97581      None\n",
       "362521     None\n",
       "374494     None\n",
       "1338       None\n",
       "121619     None\n",
       "1397378    None\n",
       "26653      None\n",
       "14746      None\n",
       "1785602    None\n",
       "76167      None\n",
       "112282     None\n",
       "522420     None\n",
       "9442       None\n",
       "40007      None\n",
       "291873     None\n",
       "557723     None\n",
       "339747     None\n",
       "403918     None\n",
       "2711       None\n",
       "319702     None\n",
       "403922     None\n",
       "217948     None\n",
       "292315     None\n",
       "109449     None\n",
       "Name: genre_ids, Length: 7377418, dtype: object"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "id_set = set()\n",
    "data_all_train.genre_ids.apply(lambda ids: add_ids_2_set(ids,id_set))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "167\n"
     ]
    }
   ],
   "source": [
    "genre_id_list = list(id_set)\n",
    "genre_id_len = len(genre_id_list)\n",
    "print(len(genre_id_list))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 建立 genre_id 索引\n",
    "genre_id_index = dict()\n",
    "for i,genre_id in enumerate(genre_id_list):\n",
    "    genre_id_index[genre_id] = i"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'2093': 0,\n",
       " '1572': 1,\n",
       " '2122': 2,\n",
       " '843': 3,\n",
       " '502': 4,\n",
       " '87': 5,\n",
       " '2130': 6,\n",
       " '1124': 7,\n",
       " '388': 8,\n",
       " '677': 9,\n",
       " '2052': 10,\n",
       " '822': 11,\n",
       " '252': 12,\n",
       " '850': 13,\n",
       " '458': 14,\n",
       " '1259': 15,\n",
       " '1995': 16,\n",
       " '698': 17,\n",
       " '2079': 18,\n",
       " '444': 19,\n",
       " '2100': 20,\n",
       " '125': 21,\n",
       " '1965': 22,\n",
       " '2144': 23,\n",
       " '516': 24,\n",
       " '1630': 25,\n",
       " '2189': 26,\n",
       " '1273': 27,\n",
       " '275': 28,\n",
       " '829': 29,\n",
       " '242': 30,\n",
       " '2194': 31,\n",
       " '2150': 32,\n",
       " '958': 33,\n",
       " '1598': 34,\n",
       " '726': 35,\n",
       " '2022': 36,\n",
       " '880': 37,\n",
       " '509': 38,\n",
       " '1103': 39,\n",
       " '1033': 40,\n",
       " '965': 41,\n",
       " '152': 42,\n",
       " '779': 43,\n",
       " '1944': 44,\n",
       " '1054': 45,\n",
       " '2072': 46,\n",
       " '1605': 47,\n",
       " '712': 48,\n",
       " '2029': 49,\n",
       " '972': 50,\n",
       " '481': 51,\n",
       " '1266': 52,\n",
       " '1187': 53,\n",
       " '649': 54,\n",
       " '1152': 55,\n",
       " '2176': 56,\n",
       " '1969': 57,\n",
       " '2086': 58,\n",
       " '331': 59,\n",
       " '374': 60,\n",
       " '474': 61,\n",
       " '691': 62,\n",
       " '1047': 63,\n",
       " '282': 64,\n",
       " '359': 65,\n",
       " '367': 66,\n",
       " '1579': 67,\n",
       " '338': 68,\n",
       " '416': 69,\n",
       " '907': 70,\n",
       " '381': 71,\n",
       " '402': 72,\n",
       " '1096': 73,\n",
       " '184': 74,\n",
       " '1011': 75,\n",
       " '857': 76,\n",
       " '2008': 77,\n",
       " '921': 78,\n",
       " '2127': 79,\n",
       " '1955': 80,\n",
       " '212': 81,\n",
       " '2183': 82,\n",
       " '2116': 83,\n",
       " '1117': 84,\n",
       " '2172': 85,\n",
       " '2157': 86,\n",
       " '786': 87,\n",
       " '947': 88,\n",
       " '1068': 89,\n",
       " '2032': 90,\n",
       " '1040': 91,\n",
       " '986': 92,\n",
       " '2215': 93,\n",
       " '1026': 94,\n",
       " '2192': 95,\n",
       " '1131': 96,\n",
       " '1145': 97,\n",
       " '423': 98,\n",
       " '1082': 99,\n",
       " '310': 100,\n",
       " '940': 101,\n",
       " '993': 102,\n",
       " '1019': 103,\n",
       " '465': 104,\n",
       " '1194': 105,\n",
       " '900': 106,\n",
       " '1633': 107,\n",
       " '1287': 108,\n",
       " '118': 109,\n",
       " '1988': 110,\n",
       " 'nan': 111,\n",
       " '864': 112,\n",
       " '2107': 113,\n",
       " '94': 114,\n",
       " '1208': 115,\n",
       " '2248': 116,\n",
       " '798': 117,\n",
       " '744': 118,\n",
       " '531': 119,\n",
       " '205': 120,\n",
       " '1169': 121,\n",
       " '2058': 122,\n",
       " '430': 123,\n",
       " '437': 124,\n",
       " '2245': 125,\n",
       " '1138': 126,\n",
       " '139': 127,\n",
       " '2109': 128,\n",
       " '1616': 129,\n",
       " '488': 130,\n",
       " '1007': 131,\n",
       " '1201': 132,\n",
       " '109': 133,\n",
       " '2213': 134,\n",
       " '1162': 135,\n",
       " '1977': 136,\n",
       " '751': 137,\n",
       " '102': 138,\n",
       " '95': 139,\n",
       " '545': 140,\n",
       " '2015': 141,\n",
       " '451': 142,\n",
       " '296': 143,\n",
       " '719': 144,\n",
       " '409': 145,\n",
       " '893': 146,\n",
       " '979': 147,\n",
       " '1280': 148,\n",
       " '1180': 149,\n",
       " '198': 150,\n",
       " '2219': 151,\n",
       " '1568': 152,\n",
       " '1000': 153,\n",
       " '191': 154,\n",
       " '656': 155,\n",
       " '1609': 156,\n",
       " '2206': 157,\n",
       " '352': 158,\n",
       " '2065': 159,\n",
       " '1155': 160,\n",
       " '1981': 161,\n",
       " '873': 162,\n",
       " '1110': 163,\n",
       " '177': 164,\n",
       " '808': 165,\n",
       " '670': 166}"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "genre_id_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "7377418"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_train.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# range(len(genre_id_index))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_genre_id_matrix(data_all_train,genre_id_index):    \n",
    "    genre_id_matrix = ss.dok_matrix((data_all_train.shape[0], genre_id_len), dtype='uint8')\n",
    "    for df_index in range(data_all_train.shape[0]):\n",
    "        line = data_all_train.iloc[df_index, :]\n",
    "        for item in str(line.genre_ids).split('|'):\n",
    "             id_index = genre_id_index[item]\n",
    "             genre_id_matrix[df_index,id_index] = 1\n",
    "    return genre_id_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "genre_id_matrix = get_genre_id_matrix(data_all_train,genre_id_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['genre_id_0',\n",
       " 'genre_id_1',\n",
       " 'genre_id_2',\n",
       " 'genre_id_3',\n",
       " 'genre_id_4',\n",
       " 'genre_id_5',\n",
       " 'genre_id_6',\n",
       " 'genre_id_7',\n",
       " 'genre_id_8',\n",
       " 'genre_id_9',\n",
       " 'genre_id_10',\n",
       " 'genre_id_11',\n",
       " 'genre_id_12',\n",
       " 'genre_id_13',\n",
       " 'genre_id_14',\n",
       " 'genre_id_15',\n",
       " 'genre_id_16',\n",
       " 'genre_id_17',\n",
       " 'genre_id_18',\n",
       " 'genre_id_19',\n",
       " 'genre_id_20',\n",
       " 'genre_id_21',\n",
       " 'genre_id_22',\n",
       " 'genre_id_23',\n",
       " 'genre_id_24',\n",
       " 'genre_id_25',\n",
       " 'genre_id_26',\n",
       " 'genre_id_27',\n",
       " 'genre_id_28',\n",
       " 'genre_id_29',\n",
       " 'genre_id_30',\n",
       " 'genre_id_31',\n",
       " 'genre_id_32',\n",
       " 'genre_id_33',\n",
       " 'genre_id_34',\n",
       " 'genre_id_35',\n",
       " 'genre_id_36',\n",
       " 'genre_id_37',\n",
       " 'genre_id_38',\n",
       " 'genre_id_39',\n",
       " 'genre_id_40',\n",
       " 'genre_id_41',\n",
       " 'genre_id_42',\n",
       " 'genre_id_43',\n",
       " 'genre_id_44',\n",
       " 'genre_id_45',\n",
       " 'genre_id_46',\n",
       " 'genre_id_47',\n",
       " 'genre_id_48',\n",
       " 'genre_id_49',\n",
       " 'genre_id_50',\n",
       " 'genre_id_51',\n",
       " 'genre_id_52',\n",
       " 'genre_id_53',\n",
       " 'genre_id_54',\n",
       " 'genre_id_55',\n",
       " 'genre_id_56',\n",
       " 'genre_id_57',\n",
       " 'genre_id_58',\n",
       " 'genre_id_59',\n",
       " 'genre_id_60',\n",
       " 'genre_id_61',\n",
       " 'genre_id_62',\n",
       " 'genre_id_63',\n",
       " 'genre_id_64',\n",
       " 'genre_id_65',\n",
       " 'genre_id_66',\n",
       " 'genre_id_67',\n",
       " 'genre_id_68',\n",
       " 'genre_id_69',\n",
       " 'genre_id_70',\n",
       " 'genre_id_71',\n",
       " 'genre_id_72',\n",
       " 'genre_id_73',\n",
       " 'genre_id_74',\n",
       " 'genre_id_75',\n",
       " 'genre_id_76',\n",
       " 'genre_id_77',\n",
       " 'genre_id_78',\n",
       " 'genre_id_79',\n",
       " 'genre_id_80',\n",
       " 'genre_id_81',\n",
       " 'genre_id_82',\n",
       " 'genre_id_83',\n",
       " 'genre_id_84',\n",
       " 'genre_id_85',\n",
       " 'genre_id_86',\n",
       " 'genre_id_87',\n",
       " 'genre_id_88',\n",
       " 'genre_id_89',\n",
       " 'genre_id_90',\n",
       " 'genre_id_91',\n",
       " 'genre_id_92',\n",
       " 'genre_id_93',\n",
       " 'genre_id_94',\n",
       " 'genre_id_95',\n",
       " 'genre_id_96',\n",
       " 'genre_id_97',\n",
       " 'genre_id_98',\n",
       " 'genre_id_99',\n",
       " 'genre_id_100',\n",
       " 'genre_id_101',\n",
       " 'genre_id_102',\n",
       " 'genre_id_103',\n",
       " 'genre_id_104',\n",
       " 'genre_id_105',\n",
       " 'genre_id_106',\n",
       " 'genre_id_107',\n",
       " 'genre_id_108',\n",
       " 'genre_id_109',\n",
       " 'genre_id_110',\n",
       " 'genre_id_111',\n",
       " 'genre_id_112',\n",
       " 'genre_id_113',\n",
       " 'genre_id_114',\n",
       " 'genre_id_115',\n",
       " 'genre_id_116',\n",
       " 'genre_id_117',\n",
       " 'genre_id_118',\n",
       " 'genre_id_119',\n",
       " 'genre_id_120',\n",
       " 'genre_id_121',\n",
       " 'genre_id_122',\n",
       " 'genre_id_123',\n",
       " 'genre_id_124',\n",
       " 'genre_id_125',\n",
       " 'genre_id_126',\n",
       " 'genre_id_127',\n",
       " 'genre_id_128',\n",
       " 'genre_id_129',\n",
       " 'genre_id_130',\n",
       " 'genre_id_131',\n",
       " 'genre_id_132',\n",
       " 'genre_id_133',\n",
       " 'genre_id_134',\n",
       " 'genre_id_135',\n",
       " 'genre_id_136',\n",
       " 'genre_id_137',\n",
       " 'genre_id_138',\n",
       " 'genre_id_139',\n",
       " 'genre_id_140',\n",
       " 'genre_id_141',\n",
       " 'genre_id_142',\n",
       " 'genre_id_143',\n",
       " 'genre_id_144',\n",
       " 'genre_id_145',\n",
       " 'genre_id_146',\n",
       " 'genre_id_147',\n",
       " 'genre_id_148',\n",
       " 'genre_id_149',\n",
       " 'genre_id_150',\n",
       " 'genre_id_151',\n",
       " 'genre_id_152',\n",
       " 'genre_id_153',\n",
       " 'genre_id_154',\n",
       " 'genre_id_155',\n",
       " 'genre_id_156',\n",
       " 'genre_id_157',\n",
       " 'genre_id_158',\n",
       " 'genre_id_159',\n",
       " 'genre_id_160',\n",
       " 'genre_id_161',\n",
       " 'genre_id_162',\n",
       " 'genre_id_163',\n",
       " 'genre_id_164',\n",
       " 'genre_id_165',\n",
       " 'genre_id_166']"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 生成 genre_id 在 dataframe 中的列名\n",
    "genre_id_columns = list()\n",
    "for i in range(genre_id_len):\n",
    "    genre_id_columns.append('genre_id_'+str(i))\n",
    "genre_id_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  (0, 65)\t1\n",
      "  (1, 15)\t1\n",
      "  (2, 15)\t1\n",
      "  (3, 103)\t1\n",
      "  (4, 75)\t1\n",
      "  (5, 15)\t1\n",
      "  (6, 104)\t1\n",
      "  (7, 75)\t1\n",
      "  (8, 36)\t1\n",
      "  (9, 104)\t1\n",
      "  (10, 104)\t1\n",
      "  (11, 104)\t1\n",
      "  (12, 104)\t1\n",
      "  (13, 14)\t1\n",
      "  (14, 104)\t1\n",
      "  (14, 14)\t1\n",
      "  (15, 104)\t1\n",
      "  (16, 104)\t1\n",
      "  (17, 14)\t1\n",
      "  (18, 104)\t1\n",
      "  (19, 104)\t1\n",
      "  (20, 14)\t1\n",
      "  (21, 104)\t1\n",
      "  (22, 104)\t1\n",
      "  (23, 36)\t1\n",
      "  :\t:\n",
      "  (7377393, 104)\t1\n",
      "  (7377394, 104)\t1\n",
      "  (7377395, 14)\t1\n",
      "  (7377396, 104)\t1\n",
      "  (7377397, 104)\t1\n",
      "  (7377398, 104)\t1\n",
      "  (7377399, 104)\t1\n",
      "  (7377400, 104)\t1\n",
      "  (7377401, 104)\t1\n",
      "  (7377402, 156)\t1\n",
      "  (7377403, 156)\t1\n",
      "  (7377404, 156)\t1\n",
      "  (7377405, 65)\t1\n",
      "  (7377406, 87)\t1\n",
      "  (7377407, 14)\t1\n",
      "  (7377408, 78)\t1\n",
      "  (7377409, 78)\t1\n",
      "  (7377410, 104)\t1\n",
      "  (7377411, 104)\t1\n",
      "  (7377412, 104)\t1\n",
      "  (7377413, 15)\t1\n",
      "  (7377414, 104)\t1\n",
      "  (7377415, 104)\t1\n",
      "  (7377416, 2)\t1\n",
      "  (7377417, 81)\t1\n"
     ]
    }
   ],
   "source": [
    "print(genre_id_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(7377418, 167)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>genre_id_0</th>\n",
       "      <th>genre_id_1</th>\n",
       "      <th>genre_id_2</th>\n",
       "      <th>genre_id_3</th>\n",
       "      <th>genre_id_4</th>\n",
       "      <th>genre_id_5</th>\n",
       "      <th>genre_id_6</th>\n",
       "      <th>genre_id_7</th>\n",
       "      <th>genre_id_8</th>\n",
       "      <th>genre_id_9</th>\n",
       "      <th>...</th>\n",
       "      <th>genre_id_157</th>\n",
       "      <th>genre_id_158</th>\n",
       "      <th>genre_id_159</th>\n",
       "      <th>genre_id_160</th>\n",
       "      <th>genre_id_161</th>\n",
       "      <th>genre_id_162</th>\n",
       "      <th>genre_id_163</th>\n",
       "      <th>genre_id_164</th>\n",
       "      <th>genre_id_165</th>\n",
       "      <th>genre_id_166</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>201969</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1932462</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>183559</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149511</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74867</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 167 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         genre_id_0  genre_id_1  genre_id_2  genre_id_3  genre_id_4  \\\n",
       "201969            0           0           0           0           0   \n",
       "1932462           0           0           0           0           0   \n",
       "183559            0           0           0           0           0   \n",
       "149511            0           0           0           0           0   \n",
       "74867             0           0           0           0           0   \n",
       "\n",
       "         genre_id_5  genre_id_6  genre_id_7  genre_id_8  genre_id_9  ...  \\\n",
       "201969            0           0           0           0           0  ...   \n",
       "1932462           0           0           0           0           0  ...   \n",
       "183559            0           0           0           0           0  ...   \n",
       "149511            0           0           0           0           0  ...   \n",
       "74867             0           0           0           0           0  ...   \n",
       "\n",
       "         genre_id_157  genre_id_158  genre_id_159  genre_id_160  genre_id_161  \\\n",
       "201969              0             0             0             0             0   \n",
       "1932462             0             0             0             0             0   \n",
       "183559              0             0             0             0             0   \n",
       "149511              0             0             0             0             0   \n",
       "74867               0             0             0             0             0   \n",
       "\n",
       "         genre_id_162  genre_id_163  genre_id_164  genre_id_165  genre_id_166  \n",
       "201969              0             0             0             0             0  \n",
       "1932462             0             0             0             0             0  \n",
       "183559              0             0             0             0             0  \n",
       "149511              0             0             0             0             0  \n",
       "74867               0             0             0             0             0  \n",
       "\n",
       "[5 rows x 167 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "genre_id_df = pd.DataFrame(genre_id_matrix.todense(), columns = genre_id_columns, index = data_all_train.index, dtype='uint8')  \n",
    "print(genre_id_df.shape)\n",
    "genre_id_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "genre_ids memory usage: 1271.239462 Mb\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(7377418, 167)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "genre_id_mem = genre_id_df.memory_usage().sum()/(1024**2)\n",
    "print('genre_ids memory usage: %f Mb' % genre_id_mem)\n",
    "genre_id_df.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data_all_train memory usage: 1165.704651 Mb\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(7377418, 19)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_train_mem = data_all_train.memory_usage().sum()/(1024**2)\n",
    "print('data_all_train memory usage: %f Mb' % data_all_train_mem)\n",
    "data_all_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# with open(model_path + 'data_all_train_v1.pkl','wb') as fw:\n",
    "#     pk.dump(data_all_train,fw)\n",
    "# fw.close()\n",
    "# 文件太大，存储报错(更改数据类型后，文件变为了原来的三分之一)\n",
    "with open(model_path + 'genre_id_df.pkl','wb') as fw:\n",
    "    pk.dump(genre_id_df,fw)\n",
    "fw.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# data_all_train = genre_id_df.merge(data_all_train,how ='left',left_index=True,right_index=True, copy = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 使用dataframe的merge函数报错，需要手动关联上, 对每个应该处理的列都统一处理(处理结束就存储)，最后批量关联上\n",
    "data_all_train = data_all_train.drop(['genre_ids'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_all_train = data_all_train.drop(['artist_name','composer','lyricist','name'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>source_system_tab</th>\n",
       "      <th>source_screen_name</th>\n",
       "      <th>source_type</th>\n",
       "      <th>target</th>\n",
       "      <th>city</th>\n",
       "      <th>bd</th>\n",
       "      <th>gender</th>\n",
       "      <th>registered_via</th>\n",
       "      <th>registration_init_time</th>\n",
       "      <th>expiration_date</th>\n",
       "      <th>song_length</th>\n",
       "      <th>language</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>201969</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>gender_NaN</td>\n",
       "      <td>7</td>\n",
       "      <td>20120102</td>\n",
       "      <td>20171005</td>\n",
       "      <td>1</td>\n",
       "      <td>52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1932462</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>1</td>\n",
       "      <td>52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>183559</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>1</td>\n",
       "      <td>52</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149511</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>20110525</td>\n",
       "      <td>20170911</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74867</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>gender_NaN</td>\n",
       "      <td>7</td>\n",
       "      <td>20120102</td>\n",
       "      <td>20171005</td>\n",
       "      <td>1</td>\n",
       "      <td>52</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 msno  \\\n",
       "201969   FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "1932462  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "183559   Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "149511   Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "74867    FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "\n",
       "                                              song_id source_system_tab  \\\n",
       "201969   BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=           explore   \n",
       "1932462  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=        my library   \n",
       "183559   JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=        my library   \n",
       "149511   2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=        my library   \n",
       "74867    3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=           explore   \n",
       "\n",
       "          source_screen_name      source_type  target  city  bd      gender  \\\n",
       "201969               Explore  online-playlist       1     1   1  gender_NaN   \n",
       "1932462  Local playlist more   local-playlist       1    13   2      female   \n",
       "183559   Local playlist more   local-playlist       1    13   2      female   \n",
       "149511   Local playlist more   local-playlist       1    13   2      female   \n",
       "74867                Explore  online-playlist       1     1   1  gender_NaN   \n",
       "\n",
       "         registered_via  registration_init_time  expiration_date  song_length  \\\n",
       "201969                7                20120102         20171005            1   \n",
       "1932462               9                20110525         20170911            1   \n",
       "183559                9                20110525         20170911            1   \n",
       "149511                9                20110525         20170911            1   \n",
       "74867                 7                20120102         20171005            1   \n",
       "\n",
       "        language  \n",
       "201969        52  \n",
       "1932462       52  \n",
       "183559        52  \n",
       "149511        -1  \n",
       "74867         52  "
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(7377418, 14)"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the msno, song_id and target columns as standalone Series\n",
    "msno_list, song_id_list, target_list = (\n",
    "    data_all_train[column_name] for column_name in ('msno', 'song_id', 'target')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the training target, msno and song_id Series to disk, one pickle file each.\n",
    "# The `with` block closes the file on exit, so no explicit close() call is needed.\n",
    "series_to_save = {\n",
    "    'target_list.pkl': target_list,\n",
    "    'msno_list.pkl': msno_list,\n",
    "    'song_id_list.pkl': song_id_list,\n",
    "}\n",
    "for file_name, series in series_to_save.items():\n",
    "    with open(model_path + file_name, 'wb') as fw:\n",
    "        pk.dump(series, fw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the msno, song_id and target columns from the frame\n",
    "data_all_train = data_all_train.drop(columns=['msno', 'song_id', 'target'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace missing values in each listed column with a column-specific placeholder label\n",
    "missing_value_labels = {\n",
    "    'source_system_tab': 'tab_NaN',\n",
    "    'source_screen_name': 'screen_name_NaN',\n",
    "    'source_type': 'source_type_NaN',\n",
    "    'city': 'city_NaN',\n",
    "    'gender': 'gender_NaN',\n",
    "    'registered_via': 'via_NaN',\n",
    "    'language': 'language_NaN',\n",
    "}\n",
    "for column_name, placeholder in missing_value_labels.items():\n",
    "    data_all_train[column_name] = data_all_train[column_name].fillna(placeholder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Take an independent deep copy of the frame as a checkpoint\n",
    "data_all_train_copy = data_all_train.copy(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reset the working frame to a fresh deep copy of the checkpoint\n",
    "data_all_train = data_all_train_copy.copy(deep=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "201969     1\n",
       "1932462    1\n",
       "183559     1\n",
       "149511     1\n",
       "74867      1\n",
       "Name: target, dtype: int64"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "target_list[0:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(7377418, 11)"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_train.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_system_tab</th>\n",
       "      <th>source_screen_name</th>\n",
       "      <th>source_type</th>\n",
       "      <th>city</th>\n",
       "      <th>bd</th>\n",
       "      <th>gender</th>\n",
       "      <th>registered_via</th>\n",
       "      <th>song_length</th>\n",
       "      <th>language</th>\n",
       "      <th>reg_interval_days</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>201969</th>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>gender_NaN</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>52</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1932462</th>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>52</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>183559</th>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>52</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149511</th>\n",
       "      <td>my library</td>\n",
       "      <td>Local playlist more</td>\n",
       "      <td>local-playlist</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>female</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74867</th>\n",
       "      <td>explore</td>\n",
       "      <td>Explore</td>\n",
       "      <td>online-playlist</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>gender_NaN</td>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>52</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        source_system_tab   source_screen_name      source_type  city  bd  \\\n",
       "201969            explore              Explore  online-playlist     1   1   \n",
       "1932462        my library  Local playlist more   local-playlist    13   2   \n",
       "183559         my library  Local playlist more   local-playlist    13   2   \n",
       "149511         my library  Local playlist more   local-playlist    13   2   \n",
       "74867             explore              Explore  online-playlist     1   1   \n",
       "\n",
       "             gender  registered_via  song_length language reg_interval_days  \n",
       "201969   gender_NaN               7            1       52                 7  \n",
       "1932462      female               9            1       52                 8  \n",
       "183559       female               9            1       52                 8  \n",
       "149511       female               9            1       -1                 8  \n",
       "74867    gender_NaN               7            1       52                 7  "
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 7377418 entries, 201969 to 109449\n",
      "Data columns (total 10 columns):\n",
      "source_system_tab     category\n",
      "source_screen_name    category\n",
      "source_type           category\n",
      "city                  category\n",
      "bd                    category\n",
      "gender                category\n",
      "registered_via        category\n",
      "song_length           category\n",
      "language              category\n",
      "reg_interval_days     category\n",
      "dtypes: category(10)\n",
      "memory usage: 126.6 MB\n"
     ]
    }
   ],
   "source": [
    "data_all_train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Derive an interval feature from registration_init_time and expiration_date"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cast both date columns to strings\n",
    "for date_column in ('registration_init_time', 'expiration_date'):\n",
    "    data_all_train[date_column] = data_all_train[date_column].astype('str')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the %Y%m%d date strings into datetime columns named <column>_format\n",
    "for date_column in ('registration_init_time', 'expiration_date'):\n",
    "    data_all_train[date_column + '_format'] = pd.to_datetime(\n",
    "        data_all_train[date_column], format='%Y%m%d'\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Timedelta between the parsed expiration date and the parsed registration date\n",
    "expiration = data_all_train['expiration_date_format']\n",
    "registration = data_all_train['registration_init_time_format']\n",
    "data_all_train['registration_interval_time'] = expiration - registration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the raw and parsed date columns\n",
    "date_columns = [\n",
    "    'registration_init_time',\n",
    "    'expiration_date',\n",
    "    'registration_init_time_format',\n",
    "    'expiration_date_format',\n",
    "]\n",
    "data_all_train = data_all_train.drop(columns=date_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Whole-day count of the registration interval.\n",
    "# `.dt.days` replaces astype('timedelta64[D]'): day resolution is not a supported\n",
    "# timedelta dtype in pandas 2.0+, so that cast is rejected there.\n",
    "# The result is an integer day count (the old cast produced floats).\n",
    "data_all_train['registration_interval_time_days'] = data_all_train['registration_interval_time'].dt.days"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3 days         141608\n",
       "700 days        36010\n",
       "608 days        32035\n",
       "303 days        31781\n",
       "7 days          30841\n",
       "395 days        29172\n",
       "334 days        27051\n",
       "364 days        26642\n",
       "701 days        26186\n",
       "578 days        24494\n",
       "609 days        23757\n",
       "639 days        22111\n",
       "456 days        21367\n",
       "669 days        21337\n",
       "670 days        20628\n",
       "273 days        19991\n",
       "577 days        19757\n",
       "731 days        18775\n",
       "792 days        17814\n",
       "640 days        17773\n",
       "548 days        17126\n",
       "396 days        16259\n",
       "730 days        16207\n",
       "457 days        15746\n",
       "761 days        14088\n",
       "486 days        12650\n",
       "365 days        12221\n",
       "413 days        12018\n",
       "1734 days       11713\n",
       "793 days        10889\n",
       "                ...  \n",
       "1972 days           6\n",
       "3708 days           6\n",
       "102 days            6\n",
       "3367 days           6\n",
       "3237 days           6\n",
       "3517 days           6\n",
       "4127 days           5\n",
       "4242 days           5\n",
       "4391 days           5\n",
       "2697 days           5\n",
       "3375 days           4\n",
       "4297 days           4\n",
       "2049 days           4\n",
       "4567 days           4\n",
       "3203 days           4\n",
       "4397 days           4\n",
       "-16191 days         3\n",
       "3124 days           3\n",
       "4151 days           3\n",
       "2037 days           3\n",
       "2827 days           2\n",
       "4067 days           2\n",
       "4088 days           2\n",
       "2971 days           1\n",
       "4657 days           1\n",
       "1999 days           1\n",
       "1977 days           1\n",
       "3130 days           1\n",
       "2016 days           1\n",
       "4534 days           1\n",
       "Name: registration_interval_time, Length: 4321, dtype: int64"
      ]
     },
     "execution_count": 90,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_train.registration_interval_time.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x7f8e3f13ae10>"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD5CAYAAADLL+UrAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjAsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+17YcXAAATY0lEQVR4nO3df7DddX3n8edLgoI/SkAi4yZo4pqdFduK7l1g1+4OQjf8aht2Vipuq9EyzXSW3VZ3dzTuuIJad3E6s3RoC7NpoURrBfxVskCLGYSx2vLj8iuIaJMCSiaMhAZSXZUt8t4/zufCIdwf5yY39yb5PB8zd873+/5+vt/v53uSvM4nn/M996SqkCT14UUL3QFJ0vwx9CWpI4a+JHXE0Jekjhj6ktSRRQvdgekcffTRtXz58oXuhiQdUO68887Hq2rJZNv269Bfvnw54+PjC90NSTqgJPnOVNuc3pGkjhj6ktQRQ1+SOmLoS1JHDH1J6sh+ffeOtD9K8oKav7hQBwpH+tIsTBb409Wl/Y2hL0kdMfQlqSOGviR1xNCXpI6MFPpJHk5yX5J7koy32lFJNiXZ0h6PbPUkuSTJ1iSbk7xl6DhrWvstSdbsm0uSJE1lNiP9t1XV8VU11tbXATdV1UrgprYOcAawsv2sBS6DwYsEcAFwInACcMHEC4UkaX7szfTOamBDW94AnD1U/1QN3AosTvJq4DRgU1XtrKongE3A6XtxfknSLI0a+gV8OcmdSda22jFV9ShAe3xVqy8FHhnad1urTVV/niRrk4wnGd+xY8foVyJJmtGon8h9a1VtT/IqYFOSb03TdrJPqdQ09ecXqtYD6wHGxsb8mKMkzaGRRvpVtb09PgZ8icGc/PfatA3t8bHWfBtw7NDuy4Dt09QlSfNkxtBP8rIkr5hYBlYB3wA2AhN34KwBrm3LG4F3t7t4TgJ2temfG4FVSY5sb+CuajVJ0jwZZXrnGOBL7XeLLAL+tKr+IskdwDVJzgO+C5zT2t8AnAlsBX4IvBegqnYm+ThwR2v3saraOWdXIkmaUfbn3w44NjZWfkeu9ifT/WK1/fnfkvqS5M6h2+ufx0/kSlJHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUkVF/9450UJuLLzYf5Rjey6+FZuhLjB7GfjhLBzqndySpI4a+NAtTjeYd5etAYehLs1RVVBWv/eB1zy5LBwpDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjoycugnOSTJ3Umua+srktyWZEuSq5O8uNVf0ta3tu3Lh47xoVb/dpLT5vpiJEnTm81I/7eAB4bWPwlcXFUrgSeA81r9POCJqno9cHFrR5LjgHOBNwKnA5cmOWTvui9Jmo2RQj/JMuAs4I/aeoBTgM+3JhuAs9vy6rZO235qa78auKqqnqqqh4CtwAlzcRGSpNGMOtL/XeADwDNt/ZXAk1X1dFvfBixty0uBRwDa9l2t/bP1SfZ5VpK1ScaTjO/YsWMWlyJJmsmMoZ/kF4DHqurO4fIkTWuGbdPt81yhan1VjVXV2JIlS2bqniRpFhaN0OatwC8lORM4DPgpBiP/xUkWtdH8MmB7a78NOBbYlmQRcASwc6g+YXgfSdI8mHGkX1UfqqplVbWcwRuxX6mqXwFuBt7emq0Brm3LG9s6bftXqqpa/dx2d88KYCVw+5xdiSRpRqOM9KfyQeCqJL8N3A1c3uqXA59OspXBCP9cgKq6P8k1wDeBp4Hzq+one3F+SdIszSr0q+oW4Ja2/CCT3H1TVT8Gzpli/08An5htJyVJc8NP5EpSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jek
jhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOzBj6SQ5LcnuSe5Pcn+Sjrb4iyW1JtiS5OsmLW/0lbX1r27586FgfavVvJzltX12UJGlyo4z0nwJOqao3AccDpyc5CfgkcHFVrQSeAM5r7c8Dnqiq1wMXt3YkOQ44F3gjcDpwaZJD5vJiJEnTmzH0a+AHbfXQ9lPAKcDnW30DcHZbXt3WadtPTZJWv6qqnqqqh4CtwAlzchWSpJGMNKef5JAk9wCPAZuAvwWerKqnW5NtwNK2vBR4BKBt3wW8crg+yT7D51qbZDzJ+I4dO2Z/RZKkKY0U+lX1k6o6HljGYHT+hsmatcdMsW2q+u7nWl9VY1U1tmTJklG6J0ka0azu3qmqJ4FbgJOAxUkWtU3LgO1teRtwLEDbfgSwc7g+yT6SpHkwyt07S5IsbsuHAz8PPADcDLy9NVsDXNuWN7Z12vavVFW1+rnt7p4VwErg9rm6EEnSzBbN3IRXAxvanTYvAq6pquuSfBO4KslvA3cDl7f2lwOfTrKVwQj/XICquj/JNcA3gaeB86vqJ3N7OZKk6cwY+lW1GXjzJPUHmeTum6r6MXDOFMf6BPCJ2XdTkjQX/ESuJHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcWLXQHpH3hTR/9Mrt+9A/7/DzL112/T49/xOGHcu8Fq/bpOdQXQ18HpV0/+gcevuishe7GXtvXLyrqj9M7ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSMzhn6SY5PcnOSBJPcn+a1WPyrJpiRb2uORrZ4klyTZmmRzkrcMHWtNa78lyZp9d1mSpMmMMtJ/GvgvVfUG4CTg/CTHAeuAm6pqJXBTWwc4A1jZftYCl8HgRQK4ADgROAG4YOKFQpI0P2YM/ap6tKruasvfBx4AlgKrgQ2t2Qbg7La8GvhUDdwKLE7yauA0YFNV7ayqJ4BNwOlzejWSpGnNak4/yXLgzcBtwDFV9SgMXhiAV7VmS4FHhnbb1mpT1Xc/x9ok40nGd+zYMZvuSZJmMHLoJ3k58AXgfVX199M1naRW09SfX6haX1VjVTW2ZMmSUbsnSRrBSKGf5FAGgf+ZqvpiK3+vTdvQHh9r9W3AsUO7LwO2T1OXJM2TUe7eCXA58EBV/a+hTRuBiTtw1gDXDtXf3e7iOQnY1aZ/bgRWJTmyvYG7qtUkSfNklN+y+VbgXcB9Se5ptf8GXARck+Q84LvAOW3bDcCZwFbgh8B7AapqZ5KPA3e0dh+rqp1zchWSpJHMGPpV9TUmn48HOHWS9gWcP8WxrgCumE0HJUlzx0/kSlJHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6Mso3Z0kHnFe8YR0/s2HdQndjr73iDQBnLXQ3dBAx9HVQ+v4DF/HwRQd+WC5fd/1Cd0EHGad3JKkjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0JekjswY+kmuSPJYkm8M1Y5KsinJlvZ4ZKsnySVJtibZnOQtQ/usae23JFmzby5HkjSdUUb6VwKn71ZbB9xUVSuBm9o6wBnAyvazFrgMBi8SwAXAicAJwAUTLxSSpPkzY+hX1VeBnbuVVwMb2vIG4Oyh+qdq4FZgcZJXA6cBm6pqZ1U9AWzihS8kkqR9bE/n9I+pqkcB2uOrWn0p8MhQu22tNlVdkjSP5vqN3ExSq2nqLzxAsjbJeJLxHTt2zGnnJKl3exr632vTNrTHx1p9G3DsULtlwPZp6i9Q
VeuraqyqxpYsWbKH3ZMkTWZPQ38jMHEHzhrg2qH6u9tdPCcBu9r0z43AqiRHtjdwV7WaJGkezfh1iUk+C5wMHJ1kG4O7cC4CrklyHvBd4JzW/AbgTGAr8EPgvQBVtTPJx4E7WruPVdXubw5LkvaxGUO/qt45xaZTJ2lbwPlTHOcK4IpZ9U6SNKf8RK4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqyIy3bEoHquXrrl/oLuy1Iw4/dKG7oIOMoa+D0sMXnbXPz7F83fXzch5pLjm9I0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOjLvoZ/k9CTfTrI1ybr5Pr8k9WxeQz/JIcAfAGcAxwHvTHLcfPZBkno23yP9E4CtVfVgVf0/4Cpg9Tz3QZK6tWiez7cUeGRofRtw4nCDJGuBtQCvec1r5q9n6lqSPdvvk7NrX1V7dB5prsz3SH+yf1nP+1dQVeuraqyqxpYsWTJP3VLvqmpefqSFNt+hvw04dmh9GbB9nvsgSd2a79C/A1iZZEWSFwPnAhvnuQ+S1K15ndOvqqeT/EfgRuAQ4Iqqun8++yBJPZvvN3KpqhuAG+b7vJIkP5ErSV0x9CWpI4a+JHXE0JekjmR//sBIkh3Adxa6H9IUjgYeX+hOSJN4bVVN+unW/Tr0pf1ZkvGqGlvofkiz4fSOJHXE0Jekjhj60p5bv9AdkGbLOX1J6ogjfUnqiKEvSR0x9CWpI4a+9lqSX0qybprtxyc5cw+OuzzJvx9aH0tyyZ72c4pz/NUIbd6X5KVzed4pznNlkreP2o8kNyRZvK/71c51SxI/k3AQMPT1PBmY1d+LqtpYVRdN0+R4YNLQTzLdr/deDjwb+lU1XlW/OZu+zaSq/uUIzd4HzCr0kxyyZz0avR9VdWZVPbkPzqODmKGviRH1A0kuBe4C3pXkr5PcleRzSV7e2p2Z5FtJvpbkkiTXtfp7kvx+Wz4nyTeS3Jvkq+0b0j4GvCPJPUnekeTCJOuTfBn4VDv/X7bz3ZVkIogvAv5V2+/9SU4eOudRSf4syeYktyb52Va/MMkVbWT6YJJpXySS/KA9ntz2+Xy7xs+0F8DfBP4RcHOSm1vbVVM8Pw8n+UiSrwEfSHL7bs/x5rb8kSR3tOdpfUb4VvYp+vFwkqPbsb+V5I/aMT+T5OeTfD3JliQntPYva8/NHUnuTrJ6mvMdnuSq9vxeDRw+tO2yJONJ7k/y0VY7NcmXhtr8myRfTHJI+x/MN5Lcl+T9M12r9rH5+kJof/bfHwYj6meAkxj8PpmvAi9r2z4IfAQ4DHgEWNHqnwWua8vvAX6/Ld8HLG3Li3ff3tYvBO4EDm/rLwUOa8srgfG2fPLEOXZfB34PuKAtnwLcM3TsvwJe0q7l74BDp7n2HwwdexeD721+EfDXwM+1bQ8DR7flSZ+foXYfGDr2PcDrhtp9uC0fNdTm08AvtuUrgbdP09dn+zG83v78ngZ+pvX9TuAKIMBq4M9a+/8B/OrEnw3wNxPXMcm5/jODb7YD+Nl2/LHh/jP49rtb2vYA3wKWtG1/Cvwi8M+ATUPHXbzQf997/3GkrwnfqapbGQT/ccDXk9wDrAFeC/xT4MGqeqi1/+wUx/k6cGWSX2cQClPZWFU/asuHAn+Y5D7gc+38M/k5BoFJVX0FeGWSI9q266vqqap6HHgMOGaE4wHcXlXbquoZBoG9fJI2Uz0/E64eWr4G+OW2/I6hbW9Lclu73lOAN47Yv+k8VFX3tb7fD9xUg5S9b+g6VgHrWr9vYfBC/popjvevgT8BqKrNwOahbb+c5C7g7tb349q5Pg38anuf4V8Afw48CLwuye8lOR34+zm4Vu2Fef+6RO23/m97DIOR2TuHNyZ58ygHqarfSHIicBZwT5Lj
ZzgfwPuB7wFvYjBS/fEIp5psSmTik4ZPDdV+wuh/z0fZb9LnZ8jwdV0NfC7JF4Gqqi1JDgMuZTBqfiTJhQzCd28N9/2ZofVneO46Avy7qvr2iMd8wSc3k6wA/ivwz6vqiSRX8lz//xj4Pwz+/D5XVU8DTyR5E3AacD6DF8FfG/WiNPcc6Wt3twJvTfJ6gCQvTfJPGPzX/XVJlrd275hs5yT/uKpuq6qPMPi1w8cC3wdeMc05jwAebaPUd/Hc/xCm2++rwK+0c54MPF5V+2oUOdyPqZ6fF6iqv2Xw4vHfeW6UPxGQj7f3Aqa8W2eGfuyJG4H/NPEewgwv5MPP708zmMIB+CkGL2y7khwDnDGxQ1VtB7YDH2YwVUWSo4EXVdUXGDwPb9mL/msOONLX81TVjiTvAT6b5CWt/OGq+psk/wH4iySPA7dPcYjfSbKSwajyJuBe4Ls8N63wPyfZ51LgC0nOAW7mudHyZuDpJPcyCJG7h/a5EPjj9uboDxlMs+wr64E/T/JoVb1tsueHwfz4ZK4GfgdYAVBVTyb5QwbTLg8Dd+xpP2Z/GXwc+F1gcwv+h4FfmKLtZTz3/N5D+/OuqnuT3M1gCulBBtN5wz7DYF7/m219aTvOxADzQ3vQb80hf/eORpbk5VX1gxYYfwBsqaqLF7pf2n9kcBfX3VV1+UL3RZNzekez8etttH4/gymZ/73A/dF+JMmdDKaB/mSh+6KpOdLXQS/JKxlMNe3u1Kr6u/nuz0za/e4rdit/sKpu3AfnOg345G7lh6rq3871ubR/MPQlqSNO70hSRwx9SeqIoS9JHTH0Jakj/x+mLK/ESRA5/wAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Box plot of the day counts, restricted to rows with a strictly positive interval\n",
    "is_positive = data_all_train['registration_interval_time_days'] > 0.0\n",
    "data_all_train.loc[is_positive, 'registration_interval_time_days'].plot.box()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bucket the day counts into left-closed bins: [min-1, 30), [30, 180), [180, 365),\n",
    "# then one bin per year up to 8 years, then everything above; labels are 0..10.\n",
    "interval_days = data_all_train['registration_interval_time_days']\n",
    "# Series.min()/max() are vectorised and skip NaN, unlike the Python builtins applied to a Series.\n",
    "# NOTE: pd.cut needs increasing edges, so this assumes min < 30 and max >= 365*8 for the data at hand.\n",
    "bins = [interval_days.min() - 1, 30, 180, 365] + [365 * years for years in range(2, 9)] + [interval_days.max() + 1]\n",
    "labels = list(range(len(bins) - 1))\n",
    "result = pd.cut(interval_days, bins=bins, right=False, labels=labels)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Store the binned interval labels held in `result` as the reg_interval_days column\n",
    "data_all_train['reg_interval_days'] = result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the raw timedelta and day-count columns\n",
    "interval_columns = ['registration_interval_time', 'registration_interval_time_days']\n",
    "data_all_train = data_all_train.drop(columns=interval_columns)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['source_system_tab', 'source_screen_name', 'source_type', 'city', 'bd', 'gender', 'registered_via', 'song_length', 'language', 'reg_interval_days']\n"
     ]
    }
   ],
   "source": [
    "# Show the column names as a plain Python list\n",
    "print(data_all_train.columns.tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert every column to the pandas category dtype\n",
    "data_all_train = data_all_train.astype({name: 'category' for name in data_all_train.columns})\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the feature frame, with every column converted to a categorical dtype, to disk.\n",
    "# The `with` block closes the file on exit, so no explicit close() call is needed.\n",
    "with open(model_path + 'data_all_train_v2.pkl', 'wb') as fw:\n",
    "    pk.dump(data_all_train, fw)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One-hot encode every column of the frame.\n",
    "# The frame is passed directly: re-selecting all of its columns first only made a needless copy.\n",
    "# The indicator dtype is pinned to uint8 so the result stays 0/1 integers on pandas\n",
    "# versions whose get_dummies default dtype is bool.\n",
    "data_all_train_onehot = pd.get_dummies(data_all_train, dtype=np.uint8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 7377418 entries, 201969 to 109449\n",
      "Columns: 109 entries, source_system_tab_discover to reg_interval_days_10\n",
      "dtypes: uint8(109)\n",
      "memory usage: 823.2 MB\n"
     ]
    }
   ],
   "source": [
    "data_all_train_onehot.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>source_system_tab_discover</th>\n",
       "      <th>source_system_tab_explore</th>\n",
       "      <th>source_system_tab_listen with</th>\n",
       "      <th>source_system_tab_my library</th>\n",
       "      <th>source_system_tab_notification</th>\n",
       "      <th>source_system_tab_radio</th>\n",
       "      <th>source_system_tab_search</th>\n",
       "      <th>source_system_tab_settings</th>\n",
       "      <th>source_system_tab_tab_NaN</th>\n",
       "      <th>source_screen_name_Album more</th>\n",
       "      <th>...</th>\n",
       "      <th>reg_interval_days_1</th>\n",
       "      <th>reg_interval_days_2</th>\n",
       "      <th>reg_interval_days_3</th>\n",
       "      <th>reg_interval_days_4</th>\n",
       "      <th>reg_interval_days_5</th>\n",
       "      <th>reg_interval_days_6</th>\n",
       "      <th>reg_interval_days_7</th>\n",
       "      <th>reg_interval_days_8</th>\n",
       "      <th>reg_interval_days_9</th>\n",
       "      <th>reg_interval_days_10</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>201969</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1932462</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>183559</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>149511</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74867</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 109 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "         source_system_tab_discover  source_system_tab_explore  \\\n",
       "201969                            0                          1   \n",
       "1932462                           0                          0   \n",
       "183559                            0                          0   \n",
       "149511                            0                          0   \n",
       "74867                             0                          1   \n",
       "\n",
       "         source_system_tab_listen with  source_system_tab_my library  \\\n",
       "201969                               0                             0   \n",
       "1932462                              0                             1   \n",
       "183559                               0                             1   \n",
       "149511                               0                             1   \n",
       "74867                                0                             0   \n",
       "\n",
       "         source_system_tab_notification  source_system_tab_radio  \\\n",
       "201969                                0                        0   \n",
       "1932462                               0                        0   \n",
       "183559                                0                        0   \n",
       "149511                                0                        0   \n",
       "74867                                 0                        0   \n",
       "\n",
       "         source_system_tab_search  source_system_tab_settings  \\\n",
       "201969                          0                           0   \n",
       "1932462                         0                           0   \n",
       "183559                          0                           0   \n",
       "149511                          0                           0   \n",
       "74867                           0                           0   \n",
       "\n",
       "         source_system_tab_tab_NaN  source_screen_name_Album more  ...  \\\n",
       "201969                           0                              0  ...   \n",
       "1932462                          0                              0  ...   \n",
       "183559                           0                              0  ...   \n",
       "149511                           0                              0  ...   \n",
       "74867                            0                              0  ...   \n",
       "\n",
       "         reg_interval_days_1  reg_interval_days_2  reg_interval_days_3  \\\n",
       "201969                     0                    0                    0   \n",
       "1932462                    0                    0                    0   \n",
       "183559                     0                    0                    0   \n",
       "149511                     0                    0                    0   \n",
       "74867                      0                    0                    0   \n",
       "\n",
       "         reg_interval_days_4  reg_interval_days_5  reg_interval_days_6  \\\n",
       "201969                     0                    0                    0   \n",
       "1932462                    0                    0                    0   \n",
       "183559                     0                    0                    0   \n",
       "149511                     0                    0                    0   \n",
       "74867                      0                    0                    0   \n",
       "\n",
       "         reg_interval_days_7  reg_interval_days_8  reg_interval_days_9  \\\n",
       "201969                     1                    0                    0   \n",
       "1932462                    0                    1                    0   \n",
       "183559                     0                    1                    0   \n",
       "149511                     0                    1                    0   \n",
       "74867                      1                    0                    0   \n",
       "\n",
       "         reg_interval_days_10  \n",
       "201969                      0  \n",
       "1932462                     0  \n",
       "183559                      0  \n",
       "149511                      0  \n",
       "74867                       0  \n",
       "\n",
       "[5 rows x 109 columns]"
      ]
     },
     "execution_count": 140,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_all_train_onehot.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "823.171526 mb\n"
     ]
    }
   ],
   "source": [
    "# In-memory size of the one-hot encoded training frame: sum the per-column byte counts\n",
    "# reported by memory_usage() (called with its defaults) and convert bytes -> MiB (2**20 bytes).\n",
    "data_all_train_onehot_mem = data_all_train_onehot.memory_usage().sum() / 2**20\n",
    "print(\"data_all_train_onehot memory_usage %f mb\" % (data_all_train_onehot_mem,))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the one-hot encoded categorical features to disk as a pickle under model_path.\n",
    "# One-hot encoded columns: ['source_system_tab', 'source_screen_name', 'source_type', 'city', 'bd', 'gender', 'registered_via', 'song_length', 'language', 'reg_interval_days']\n",
    "# The `with` block closes the file handle on exit (even if dump raises), so no explicit close() is needed.\n",
    "with open(model_path + 'data_all_train_onehot.pkl', 'wb') as fw:\n",
    "    pk.dump(data_all_train_onehot, fw)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
